Commit
•
54c440a
1
Parent(s):
391c14d
Update pipeline explorer
Browse files- .gitignore +1 -0
- README.md +1 -1
- app.py +141 -0
- requirements.txt +6 -0
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
.env
|
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title: Distilabel Synthetic Data Pipeline Explorer
|
3 |
emoji: 🦀
|
4 |
colorFrom: purple
|
5 |
colorTo: yellow
|
|
|
1 |
---
|
2 |
+
title: Distilabel Synthetic Data Pipeline Explorer p
|
3 |
emoji: 🦀
|
4 |
colorFrom: purple
|
5 |
colorTo: yellow
|
app.py
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
import urllib
|
3 |
+
from typing import Iterable
|
4 |
+
|
5 |
+
import gradio as gr
|
6 |
+
import markdown as md
|
7 |
+
import pandas as pd
|
8 |
+
from distilabel.cli.pipeline.utils import _build_pipeline_panel, get_pipeline
|
9 |
+
from gradio_huggingfacehub_search import HuggingfaceHubSearch
|
10 |
+
from gradio_leaderboard import ColumnFilter, Leaderboard, SearchColumns, SelectColumns
|
11 |
+
from gradio_modal import Modal
|
12 |
+
from huggingface_hub import HfApi, HfFileSystem, RepoCard
|
13 |
+
from huggingface_hub.hf_api import DatasetInfo
|
14 |
+
|
15 |
+
# Shared Hugging Face clients, created once when the module is imported.
api = HfApi()  # Hub client used below to list datasets
fs = HfFileSystem()  # filesystem-style access used to test for files in a repo

# Example value of the Hub search component; not referenced elsewhere in the
# visible code, kept as a module-level name.
example = HuggingfaceHubSearch().example_value()
|
20 |
+
|
21 |
+
def _categorize_dtypes(df):
    """Translate the dtype of every column of ``df`` into a category name.

    Args:
        df: Object exposing ``dtypes.items()`` (a pandas ``DataFrame`` here).

    Returns:
        A list with one entry per column, in column order. Recognised dtype
        names map to ``'number'``, ``'bool'``, ``'date'`` or ``'str'``; any
        other dtype name maps to ``'markdown'``.
    """
    known_categories = {
        'int64': 'number',
        'float64': 'number',
        'bool': 'bool',
        'datetime64[ns]': 'date',
        'datetime64[ns, UTC]': 'date',
        'object': 'str',
    }
    # Lookup is by the dtype's string form; unknown names fall back to markdown.
    return [
        known_categories.get(str(column_dtype), 'markdown')
        for _, column_dtype in df.dtypes.items()
    ]
|
39 |
+
|
40 |
+
def _get_tag_category(entry: list[str], tag_category: str):
    """Return the value of the first tag in ``entry`` that mentions ``tag_category``.

    Tags are strings such as ``"size_categories:1K<n<10K"``; for that tag and
    ``tag_category="size_categories"`` the result is ``"1K<n<10K"``.

    Args:
        entry: Tags of one dataset. ``None`` or an empty list is accepted.
        tag_category: Category name to look for.

    Returns:
        The text after ``"<tag_category>:"`` in the first matching tag, or
        ``None`` when no tag matches.
    """
    if not entry:
        return None
    for item in entry:
        if tag_category in item:
            return item.split(f"{tag_category}:")[-1]
    # Only give up after every tag has been inspected, not after the first
    # tag that fails to match.
    return None
|
46 |
+
|
47 |
+
def _has_pipeline(repo_id):
    """Return the rendered pipeline panel of a dataset repo, or ``""``.

    Args:
        repo_id: Dataset repository id, e.g. ``"user/dataset"``.

    Returns:
        ``str(_build_pipeline_panel(pipeline))`` when the repository contains
        ``pipeline.log``, otherwise the empty string.
    """
    file_path = f"datasets/{repo_id}/pipeline.log"
    if not fs.exists(file_path):
        return ""
    # Must be an f-string: without the prefix the literal text "{file_path}"
    # would be sent instead of the repository path.
    # NOTE(review): confirm that get_pipeline accepts this URL shape (a
    # pipeline.log path without a /raw/ or /resolve/ segment).
    url = f"https://huggingface.co/{file_path}"
    pipeline = get_pipeline(url)
    return str(_build_pipeline_panel(pipeline))
|
55 |
+
|
56 |
+
|
57 |
+
|
58 |
+
async def check_pipelines(repo_ids):
    """Run ``_has_pipeline`` for many repositories concurrently.

    ``_has_pipeline`` is a blocking, synchronous function taking a single
    ``repo_id`` argument, so each call is dispatched to a worker thread and
    awaited; passing its plain return values to ``asyncio.gather`` would fail.

    Args:
        repo_ids: Iterable of dataset repository ids.

    Returns:
        A dict mapping each repo id to the string returned by
        ``_has_pipeline`` for it.
    """
    # Materialise once so a generator argument can be iterated twice below.
    repo_ids = list(repo_ids)
    tasks = [asyncio.to_thread(_has_pipeline, repo_id) for repo_id in repo_ids]
    results = await asyncio.gather(*tasks)

    return dict(zip(repo_ids, results))
|
63 |
+
|
64 |
+
def _search_distilabel_repos(query: str = None,) -> pd.DataFrame:
    """List Hub datasets tagged ``library:distilabel`` as a DataFrame.

    Args:
        query: Optional free-text search term. When falsy, all distilabel
            datasets are listed.

    Returns:
        A DataFrame with one row per dataset. The columns ``id``, ``likes``,
        ``downloads``, ``size_categories``, ``has_pipeline``,
        ``last_modified`` and ``description`` come first, followed by the
        remaining dataset attributes. ``has_pipeline`` is filled with ``""``.
    """
    # The search term goes through the dedicated ``search`` parameter instead
    # of being spliced into the filter string.
    datasets: Iterable[DatasetInfo] = api.list_datasets(
        filter="library:distilabel",
        search=query or None,
    )
    data = [ex.__dict__ for ex in datasets]
    df = pd.DataFrame.from_records(data)

    if "tags" in df.columns:
        df["size_categories"] = df["tags"].apply(_get_tag_category, args=("size_categories",))
    else:
        # No results (or no tag information): keep the column so callers can
        # rely on it being present.
        df["size_categories"] = None

    # Pipeline detection is not computed here; check_pipelines(df.id.tolist())
    # could populate this column at the cost of one Hub lookup per dataset.
    df["has_pipeline"] = ""

    subset_columns = ['id', 'likes', 'downloads', "size_categories", 'has_pipeline', 'last_modified', 'description']
    new_column_order = subset_columns + [col for col in df.columns if col not in subset_columns]
    # reindex (rather than df[...]) tolerates leading columns that are absent,
    # e.g. when the search returned nothing.
    df = df.reindex(columns=new_column_order)

    return df
|
79 |
+
|
80 |
+
def _create_modal_info(row: dict) -> str:
    """Assemble the HTML fragment shown in the modal for one dataset row.

    Args:
        row: Mapping for the selected row; only ``row["id"]`` (the dataset
            repository id) is read.

    Returns:
        Four parts joined with ``<br>``: a linked title, the pipeline text
        from ``_has_pipeline``, an embedded dataset viewer iframe, and the
        dataset card rendered from Markdown to HTML.
    """
    repo_id = row["id"]

    title_html = f'<h1> <a href="https://huggingface.co/datasets/{repo_id}">{repo_id}</a> </h1>'
    pipeline_html = f'pipeline available: {_has_pipeline(repo_id=repo_id)}'
    viewer_html = f"""<iframe src="https://huggingface.co/datasets/{repo_id}/embed/viewer" frameborder="0" width="100%" height="560px"></iframe>"""
    # Load the dataset card from the Hub and render its Markdown body.
    card = RepoCard.load(repo_id_or_path=repo_id, repo_type="dataset")
    card_html = md.markdown(card.text)

    sections = [title_html, pipeline_html, viewer_html, card_html]
    return "<br>".join(sections)
|
96 |
+
|
97 |
+
# Define the Gradio interface
# NOTE(review): delete_cache=[1,1] is passed through unchanged; confirm the
# intended cache clean-up frequency/age against the Gradio documentation.
with gr.Blocks(delete_cache=[1,1]) as demo:
    gr.Markdown("# ⚗️ Distilabel Synthetic Data Pipeline Finder")
    gr.HTML("Select a dataset to show the pipeline, dataset viewer and model card.")
    # The dataset listing is fetched once, while the UI is being built.
    df: pd.DataFrame = _search_distilabel_repos()

    leader_board = Leaderboard(
        value=df,
        datatype=_categorize_dtypes(df),
        search_columns=SearchColumns(primary_column="id", secondary_columns=["description", "author"],
                                     placeholder="Search by id, description or author. To search by description or author, type 'description:<query>', 'author:<query>'",
                                     label="Search"),
        filter_columns=[
            ColumnFilter("likes", type="slider", min=0, max=df.likes.max(), default=[0, df.likes.max()]),
            ColumnFilter("downloads", type="slider", min=0, max=df.downloads.max(), default=[0, df.downloads.max()]),
            ColumnFilter("size_categories", type="checkboxgroup"),
            ColumnFilter("has_pipeline", type="checkboxgroup"),
        ],
        hide_columns=[
            "_id", "private", "gated", "disabled", "sha", "downloads_all_time", "paperswithcode_id", "tags", "siblings",
            "cardData", "lastModified", "card_data", "key"],
        select_columns=SelectColumns(default_selection=["id", "last_modified", "downloads", "likes", "size_categories"],
                                     cant_deselect=["id"],
                                     label="Select The Columns",
                                     info="Helpful information"),
    )

    with Modal() as modal:
        markdown = gr.HTML(value="test")

    def update(leader_board, markdown, evt: gr.SelectData):
        """Open the modal with details for the row that was selected.

        Args:
            leader_board: Current leaderboard value (a DataFrame).
            markdown: Current HTML content of the modal.
            evt: Selection event; a non-int ``evt.index`` is treated as a
                sequence whose first element is the row position.

        Returns:
            Always three values, matching the three outputs registered on
            ``leader_board.select``: the leaderboard value, the modal HTML,
            and an update for the modal component.
        """
        if isinstance(evt.index, int):
            # Nothing to show: leave every output as it is. gr.update() with
            # no arguments is a no-op update for the modal.
            return leader_board, markdown, gr.update()

        index = evt.index[0]  # first element is used as the row position
        markdown = _create_modal_info(row=leader_board.iloc[index].to_dict())
        return leader_board, markdown, Modal(visible=True)

    leader_board.select(update, [leader_board, markdown], [leader_board, markdown, modal], show_progress="hidden")


if __name__ == "__main__":
    demo.launch()
|
141 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
huggingface_hub
|
2 |
+
gradio
|
3 |
+
distilabel[openai]
|
4 |
+
gradio_modal
|
5 |
+
gradio_leaderboard
|
6 |
+
markdown
|