davidberenstein1957 HF staff committed on
Commit
54c440a
1 Parent(s): 391c14d

Update pipeline explorer

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. README.md +1 -1
  3. app.py +141 -0
  4. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: Distilabel Synthetic Data Pipeline Explorer
3
  emoji: 🦀
4
  colorFrom: purple
5
  colorTo: yellow
 
1
  ---
2
+ title: Distilabel Synthetic Data Pipeline Explorer p
3
  emoji: 🦀
4
  colorFrom: purple
5
  colorTo: yellow
app.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import urllib
3
+ from typing import Iterable
4
+
5
+ import gradio as gr
6
+ import markdown as md
7
+ import pandas as pd
8
+ from distilabel.cli.pipeline.utils import _build_pipeline_panel, get_pipeline
9
+ from gradio_huggingfacehub_search import HuggingfaceHubSearch
10
+ from gradio_leaderboard import ColumnFilter, Leaderboard, SearchColumns, SelectColumns
11
+ from gradio_modal import Modal
12
+ from huggingface_hub import HfApi, HfFileSystem, RepoCard
13
+ from huggingface_hub.hf_api import DatasetInfo
14
+
15
# Shared Hugging Face Hub clients, created once at import time and reused by
# the helper functions below.
api = HfApi()
fs = HfFileSystem()

# Example value exposed by the Hub search component.
example = HuggingfaceHubSearch().example_value()
20
+
21
def _categorize_dtypes(df):
    """Return one Leaderboard datatype label per column of *df*.

    Labels are chosen from the pandas dtype of each column, in column order:

    * boolean dtypes            -> ``"bool"``
    * any other numeric dtype   -> ``"number"``
    * any datetime64 dtype      -> ``"date"`` (naive or timezone-aware)
    * object / string dtypes    -> ``"str"``
    * everything else           -> ``"markdown"``

    Dtype *families* are tested instead of exact dtype names so that e.g.
    ``int32``, ``float32``, nullable integer or non-nanosecond datetime
    columns are not silently labelled ``"markdown"``.

    Args:
        df: The ``pandas.DataFrame`` whose columns should be categorised.

    Returns:
        A list of label strings with ``len(df.columns)`` entries.
    """
    types = pd.api.types

    def _label(dtype) -> str:
        # bool has to be checked before numeric: pandas counts bool as numeric.
        if types.is_bool_dtype(dtype):
            return "bool"
        if types.is_numeric_dtype(dtype):
            return "number"
        if types.is_datetime64_any_dtype(dtype):
            return "date"
        if types.is_object_dtype(dtype) or types.is_string_dtype(dtype):
            return "str"
        return "markdown"

    return [_label(dtype) for dtype in df.dtypes]
39
+
40
def _get_tag_category(entry: list[str], tag_category: str):
    """Return the value of the first ``"<tag_category>:<value>"`` tag in *entry*.

    Every tag is inspected; the search does not stop at the first tag that
    belongs to a different category.

    Args:
        entry: Tags of a single repository, e.g.
            ``["library:distilabel", "size_categories:n<1K"]``. ``None`` or an
            empty list is treated as "no tags".
        tag_category: The category prefix to look for, without the colon.

    Returns:
        The text after ``"<tag_category>:"`` for the first matching tag, or
        ``None`` when no tag of that category is present.
    """
    prefix = f"{tag_category}:"
    for item in entry or ():
        if item.startswith(prefix):
            return item[len(prefix):]
    return None
46
+
47
def _has_pipeline(repo_id):
    """Return a rendered pipeline panel for *repo_id*, or ``""`` if unavailable.

    The dataset repository is probed for a ``pipeline.log`` file through the
    module-level ``HfFileSystem``. When it exists, the pipeline is loaded from
    the matching ``huggingface.co`` URL and its panel is returned as text.

    Args:
        repo_id: Dataset repository id, e.g. ``"user/dataset"``.

    Returns:
        ``str(_build_pipeline_panel(pipeline))`` when the file exists,
        otherwise an empty string.
    """
    file_path = f"datasets/{repo_id}/pipeline.log"
    if not fs.exists(file_path):
        return ""

    # The f-prefix matters: without it the literal text "{file_path}" is sent.
    # NOTE(review): get_pipeline presumably expects a raw YAML/JSON config URL
    # (e.g. ".../raw/main/pipeline.yaml") rather than the log file - confirm
    # against the distilabel CLI before relying on this path.
    url = f"https://huggingface.co/{file_path}"
    pipeline = get_pipeline(url)
    return str(_build_pipeline_panel(pipeline))
55
+
56
+
57
+
58
async def check_pipelines(repo_ids):
    """Run ``_has_pipeline`` for every repository id concurrently.

    ``_has_pipeline`` is a blocking, single-argument function, so each call is
    dispatched to a worker thread with ``asyncio.to_thread`` and the resulting
    awaitables are gathered.

    Args:
        repo_ids: Iterable of dataset repository ids.

    Returns:
        A dict mapping each repository id to the value ``_has_pipeline``
        returned for it; an empty dict when *repo_ids* is empty.
    """
    # Materialise once so the ids can be used both for the calls and the zip.
    repo_ids = list(repo_ids)
    results = await asyncio.gather(
        *(asyncio.to_thread(_has_pipeline, repo_id) for repo_id in repo_ids)
    )
    return dict(zip(repo_ids, results))
63
+
64
def _search_distilabel_repos(query: str = None):
    """List Hub datasets tagged ``library:distilabel`` as a DataFrame.

    Args:
        query: Optional free-text search term. It is passed to the Hub through
            the ``search`` argument of ``list_datasets``; ``None`` or an empty
            string lists every distilabel dataset.

    Returns:
        A ``pandas.DataFrame`` with one row per dataset. The columns ``id``,
        ``likes``, ``downloads``, ``size_categories``, ``has_pipeline``,
        ``last_modified`` and ``description`` come first, followed by the
        remaining dataset attributes. When nothing matches, an empty frame
        with just those leading columns is returned.
    """
    subset_columns = [
        "id",
        "likes",
        "downloads",
        "size_categories",
        "has_pipeline",
        "last_modified",
        "description",
    ]

    datasets: Iterable[DatasetInfo] = api.list_datasets(
        filter="library:distilabel",
        search=query or None,
    )
    records = [info.__dict__ for info in datasets]
    df = pd.DataFrame.from_records(records)
    if df.empty:
        # No records means no "tags" column to derive size_categories from.
        return pd.DataFrame(columns=subset_columns)

    df["size_categories"] = df["tags"].apply(
        _get_tag_category, args=("size_categories",)
    )
    # Left blank here; the pipeline is looked up per repository when a row is
    # opened (see _create_modal_info).
    df["has_pipeline"] = ""

    remaining = [col for col in df.columns if col not in subset_columns]
    return df[subset_columns + remaining]
79
+
80
def _create_modal_info(row: dict) -> str:
    """Build the HTML shown in the modal for one leaderboard row.

    The result is four fragments joined by ``<br>``, in this order: a linked
    title, the pipeline panel text from ``_has_pipeline``, an embedded dataset
    viewer iframe, and the dataset card rendered from Markdown to HTML.

    Args:
        row: Mapping for the selected row; only ``row["id"]`` (the dataset
            repository id) is read.

    Returns:
        The combined HTML string.
    """
    repo_id = row["id"]

    title_html = f'<h1> <a href="https://huggingface.co/datasets/{repo_id}">{repo_id}</a> </h1>'
    pipeline_html = f'pipeline available: {_has_pipeline(repo_id=repo_id)}'
    viewer_html = f"""<iframe src="https://huggingface.co/datasets/{repo_id}/embed/viewer" frameborder="0" width="100%" height="560px"></iframe>"""
    card = RepoCard.load(repo_id_or_path=repo_id, repo_type="dataset")
    card_html = md.markdown(card.text)

    fragments = [title_html, pipeline_html, viewer_html, card_html]
    return "<br>".join(fragments)
96
+
97
# Define the Gradio interface: a searchable leaderboard of distilabel datasets
# plus a modal that shows details for the selected row.
# NOTE(review): delete_cache=[1, 1] looks like (frequency, age) in seconds,
# i.e. a very aggressive cache purge - confirm this is intended.
with gr.Blocks(delete_cache=[1, 1]) as demo:
    gr.Markdown("# ⚗️ Distilabel Synthetic Data Pipeline Finder")
    gr.HTML("Select a dataset to show the pipeline, dataset viewer and model card.")
    df: pd.DataFrame = _search_distilabel_repos()

    leader_board = Leaderboard(
        value=df,
        datatype=_categorize_dtypes(df),
        search_columns=SearchColumns(
            primary_column="id",
            secondary_columns=["description", "author"],
            placeholder="Search by id, description or author. To search by description or author, type 'description:<query>', 'author:<query>'",
            label="Search",
        ),
        filter_columns=[
            ColumnFilter("likes", type="slider", min=0, max=df.likes.max(), default=[0, df.likes.max()]),
            ColumnFilter("downloads", type="slider", min=0, max=df.downloads.max(), default=[0, df.downloads.max()]),
            ColumnFilter("size_categories", type="checkboxgroup"),
            ColumnFilter("has_pipeline", type="checkboxgroup"),
        ],
        hide_columns=[
            "_id", "private", "gated", "disabled", "sha", "downloads_all_time", "paperswithcode_id", "tags", "siblings",
            "cardData", "lastModified", "card_data", "key",
        ],
        select_columns=SelectColumns(
            default_selection=["id", "last_modified", "downloads", "likes", "size_categories"],
            cant_deselect=["id"],
            label="Select The Columns",
            info="Helpful information",
        ),
    )

    with Modal() as modal:
        markdown = gr.HTML(value="test")

    def update(leader_board, markdown, evt: gr.SelectData):
        """Open the modal with details for the row that was clicked.

        Always returns three values, one per wired output
        (leaderboard, HTML, modal).
        """
        if isinstance(evt.index, int):
            # No (row, column) pair to resolve: leave every output untouched.
            return leader_board, markdown, gr.update()

        index = evt.index[0]  # Assuming evt.index is a list or similar structure
        markdown = _create_modal_info(row=leader_board.iloc[index].to_dict())
        return leader_board, markdown, Modal(visible=True)

    leader_board.select(
        update,
        [leader_board, markdown],
        [leader_board, markdown, modal],
        show_progress="hidden",
    )


if __name__ == "__main__":
    demo.launch()
141
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ huggingface_hub
2
+ gradio
3
+ distilabel[openai]
4
+ gradio_modal
5
+ gradio_leaderboard
6
+ markdown