davidberenstein1957 HF staff committed on
Commit
40e000b
·
1 Parent(s): 4b6f0f0

feat: add stop_sequences to magpie generation

Browse files

feat: add purple-ish theme
docs: add context
fix: remove script upload

app.py CHANGED
@@ -7,6 +7,7 @@ demo = gr.TabbedInterface(
7
  ["Supervised Fine-Tuning"],
8
  title="⚗️ Distilabel Dataset Generator",
9
  head="⚗️ Distilabel Dataset Generator",
 
10
  )
11
 
12
  if __name__ == "__main__":
 
7
  ["Supervised Fine-Tuning"],
8
  title="⚗️ Distilabel Dataset Generator",
9
  head="⚗️ Distilabel Dataset Generator",
10
+ theme="ParityError/Interstellar",
11
  )
12
 
13
  if __name__ == "__main__":
src/distilabel_dataset_generator/sft.py CHANGED
@@ -10,10 +10,12 @@ from distilabel.steps.tasks import MagpieGenerator, TextGeneration
10
 
11
  from src.distilabel_dataset_generator.utils import (
12
  OAuthToken,
 
13
  get_duplicate_button,
14
  get_login_button,
15
  get_org_dropdown,
16
  list_orgs,
 
17
  )
18
 
19
  INFORMATION_SEEKING_PROMPT = (
@@ -151,6 +153,13 @@ def _run_pipeline(result_queue, num_turns, num_rows, system_prompt, token: str =
151
  generation_kwargs={
152
  "temperature": 0.8, # it's the best value for Llama 3.1 70B Instruct
153
  "do_sample": True,
 
 
 
 
 
 
 
154
  },
155
  api_key=token,
156
  ),
@@ -229,7 +238,7 @@ def generate_dataset(
229
  distiset.push_to_hub(
230
  repo_id=repo_id,
231
  private=private,
232
- include_script=True,
233
  token=token.token,
234
  )
235
  gr.Info(f"Dataset pushed to Hugging Face Hub: https://huggingface.co/{repo_id}")
@@ -252,76 +261,92 @@ def generate_dataset(
252
  with gr.Blocks(
253
  title="⚗️ Distilabel Dataset Generator",
254
  head="⚗️ Distilabel Dataset Generator",
 
255
  ) as demo:
 
 
 
 
 
256
  with gr.Row(variant="panel"):
257
  with gr.Column():
258
  btn_login = get_login_button()
259
  with gr.Column():
260
  btn_duplicate = get_duplicate_button()
 
 
 
 
 
 
261
 
262
- dataset_description = gr.Textbox(
263
- label="Provide a description of the dataset",
264
- value=DEFAULT_SYSTEM_PROMPT_DESCRIPTION,
265
- )
266
-
267
- btn_generate_system_prompt = gr.Button(value="🧪 Generate Sytem Prompt")
268
-
269
- system_prompt = gr.Textbox(
270
- label="Provide or correct the system prompt", value=DEFAULT_SYSTEM_PROMPT
271
- )
272
-
273
- btn_generate_system_prompt.click(
274
- fn=generate_system_prompt,
275
- inputs=[dataset_description],
276
- outputs=[system_prompt],
277
- )
278
 
279
- btn_generate_sample_dataset = gr.Button(
280
- value="🧪 Generate Sample Dataset of 5 rows and a single turn",
281
- )
 
282
 
283
- table = gr.Dataframe(label="Generated Dataset", wrap=True, value=DEFAULT_DATASET)
 
 
 
 
284
 
285
- btn_generate_sample_dataset.click(
286
- fn=generate_dataset,
287
- inputs=[system_prompt],
288
- outputs=[table],
289
- )
290
 
291
- with gr.Row(variant="panel"):
292
- num_turns = gr.Number(
293
- value=1,
294
- label="Number of turns in the conversation",
295
- minimum=1,
296
- info="Whether the dataset is for a single turn with 'instruction-response' columns or a multi-turn conversation with a 'conversation' column.",
297
- )
298
- num_rows = gr.Number(
299
- value=100,
300
- label="Number of rows in the dataset",
301
- minimum=1,
302
- info="The number of rows in the dataset. Note that you are able to generate several 1000 rows at once but that this will take time.",
303
- )
304
- private = gr.Checkbox(label="Private dataset", value=True, interactive=True)
305
 
306
- with gr.Row(variant="panel"):
307
- orgs_selector = gr.Dropdown(label="Organization")
308
- dataset_name_push_to_hub = gr.Textbox(label="Dataset Name to push to Hub")
 
 
309
 
310
- btn_generate_full_dataset = gr.Button(
311
- value="⚗️ Generate Full Dataset", variant="primary"
312
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
 
314
- btn_generate_full_dataset.click(
315
- fn=generate_dataset,
316
- inputs=[
317
- system_prompt,
318
- num_turns,
319
- num_rows,
320
- private,
321
- orgs_selector,
322
- dataset_name_push_to_hub,
323
- ],
324
- )
325
 
326
  demo.load(get_org_dropdown, outputs=[orgs_selector])
 
 
327
  demo
 
10
 
11
  from src.distilabel_dataset_generator.utils import (
12
  OAuthToken,
13
+ get_css,
14
  get_duplicate_button,
15
  get_login_button,
16
  get_org_dropdown,
17
  list_orgs,
18
+ swap_visibilty,
19
  )
20
 
21
  INFORMATION_SEEKING_PROMPT = (
 
153
  generation_kwargs={
154
  "temperature": 0.8, # it's the best value for Llama 3.1 70B Instruct
155
  "do_sample": True,
156
+ "stop_sequences": [
157
+ "<|eot_id|>",
158
+ "<|end_of_text|>",
159
+ "<|start_header_id|>",
160
+ "<|end_header_id|>",
161
+ "assistant",
162
+ ],
163
  },
164
  api_key=token,
165
  ),
 
238
  distiset.push_to_hub(
239
  repo_id=repo_id,
240
  private=private,
241
+ include_script=False,
242
  token=token.token,
243
  )
244
  gr.Info(f"Dataset pushed to Hugging Face Hub: https://huggingface.co/{repo_id}")
 
261
  with gr.Blocks(
262
  title="⚗️ Distilabel Dataset Generator",
263
  head="⚗️ Distilabel Dataset Generator",
264
+ css=get_css(),
265
  ) as demo:
266
+ gr.Markdown(
267
+ """
268
+ ### Generate a high quality SFT dataset in a breeze using [🐦‍⬛MagPie](https://arxiv.org/abs/2406.08464) and [🦙Llama 3.1 - 70B Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct). More information on distilabel and techniques can be found in the "FAQ" tab. The code can be found in the [Spaces repository](https://huggingface.co/spaces/argilla/distilabel-dataset-generator/tree/main).
269
+ """
270
+ )
271
  with gr.Row(variant="panel"):
272
  with gr.Column():
273
  btn_login = get_login_button()
274
  with gr.Column():
275
  btn_duplicate = get_duplicate_button()
276
+ with gr.Row():
277
+ with gr.Column(visible=True) as main_ui:
278
+ dataset_description = gr.Textbox(
279
+ label="Provide a description of the dataset",
280
+ value=DEFAULT_SYSTEM_PROMPT_DESCRIPTION,
281
+ )
282
 
283
+ btn_generate_system_prompt = gr.Button(value="🧪 Generate Sytem Prompt")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
+ system_prompt = gr.Textbox(
286
+ label="Provide or correct the system prompt",
287
+ value=DEFAULT_SYSTEM_PROMPT,
288
+ )
289
 
290
+ btn_generate_system_prompt.click(
291
+ fn=generate_system_prompt,
292
+ inputs=[dataset_description],
293
+ outputs=[system_prompt],
294
+ )
295
 
296
+ btn_generate_sample_dataset = gr.Button(
297
+ value="🧪 Generate Sample Dataset of 5 rows and a single turn",
298
+ )
 
 
299
 
300
+ table = gr.Dataframe(
301
+ label="Generated Dataset", wrap=True, value=DEFAULT_DATASET
302
+ )
 
 
 
 
 
 
 
 
 
 
 
303
 
304
+ btn_generate_sample_dataset.click(
305
+ fn=generate_dataset,
306
+ inputs=[system_prompt],
307
+ outputs=[table],
308
+ )
309
 
310
+ with gr.Row(variant="panel"):
311
+ num_turns = gr.Number(
312
+ value=1,
313
+ label="Number of turns in the conversation",
314
+ minimum=1,
315
+ info="Whether the dataset is for a single turn with 'instruction-response' columns or a multi-turn conversation with a 'conversation' column.",
316
+ )
317
+ num_rows = gr.Number(
318
+ value=100,
319
+ label="Number of rows in the dataset",
320
+ minimum=1,
321
+ info="The number of rows in the dataset. Note that you are able to generate several 1000 rows at once but that this will take time.",
322
+ )
323
+ private = gr.Checkbox(
324
+ label="Private dataset", value=True, interactive=True
325
+ )
326
+
327
+ with gr.Row(variant="panel"):
328
+ orgs_selector = gr.Dropdown(label="Organization")
329
+ dataset_name_push_to_hub = gr.Textbox(
330
+ label="Dataset Name to push to Hub"
331
+ )
332
+
333
+ btn_generate_full_dataset = gr.Button(
334
+ value="⚗️ Generate Full Dataset", variant="primary"
335
+ )
336
 
337
+ btn_generate_full_dataset.click(
338
+ fn=generate_dataset,
339
+ inputs=[
340
+ system_prompt,
341
+ num_turns,
342
+ num_rows,
343
+ private,
344
+ orgs_selector,
345
+ dataset_name_push_to_hub,
346
+ ],
347
+ )
348
 
349
  demo.load(get_org_dropdown, outputs=[orgs_selector])
350
+ demo.load(fn=swap_visibilty, outputs=main_ui)
351
+
352
  demo
src/distilabel_dataset_generator/utils.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import gradio as gr
2
  from gradio.oauth import (
3
  OAUTH_CLIENT_ID,
@@ -65,3 +67,26 @@ def get_org_dropdown(token: OAuthToken = None):
65
  return gr.Dropdown(
66
  label="Organization", choices=orgs, value=orgs[0] if orgs else None
67
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+
3
  import gradio as gr
4
  from gradio.oauth import (
5
  OAUTH_CLIENT_ID,
 
67
  return gr.Dropdown(
68
  label="Organization", choices=orgs, value=orgs[0] if orgs else None
69
  )
70
+
71
+
72
def swap_visibilty(profile: Union[gr.OAuthProfile, None]):
    """Return a ``gr.Column`` update that shows or hides the main UI.

    The column is hidden only when ``get_space()`` returns a truthy value
    and ``profile`` is ``None``; in every other case it is visible.

    NOTE(review): ``get_space`` is defined outside this view; presumably it
    reports whether the app runs inside a Hugging Face Space -- confirm.

    Args:
        profile: The OAuth profile of the current user, or ``None`` when no
            profile is available.

    Returns:
        A ``gr.Column`` whose ``visible`` flag reflects the rule above.
    """
    # Short-circuits exactly like the original nested ifs: ``profile`` only
    # matters when ``get_space()`` is truthy.
    logged_out_in_space = bool(get_space()) and profile is None
    return gr.Column(visible=not logged_out_in_space)
80
+
81
+
82
def get_css():
    """Return the custom CSS rules for the app as a single string."""
    return """
h1{font-size: 2em}
h3{margin-top: 0}
#component-1{text-align:center}
.main_ui_logged_out{opacity: 0.3; pointer-events: none}
.tabitem{border: 0px}
.group_padding{padding: .55em}
#space_model .wrap > label:last-child{opacity: 0.3; pointer-events:none}
"""