Commit dd0124d by davidberenstein1957 · Parents: 9029def 3922cde

Merge branch 'main' of https://github.com/argilla-io/synthetic-data-generator

src/synthetic_dataset_generator/app.py CHANGED
@@ -1,5 +1,5 @@
 from synthetic_dataset_generator._tabbedinterface import TabbedInterface
-from synthetic_dataset_generator.apps.eval import app as eval_app
+# from synthetic_dataset_generator.apps.eval import app as eval_app
 from synthetic_dataset_generator.apps.readme import app as readme_app
 from synthetic_dataset_generator.apps.sft import app as sft_app
 from synthetic_dataset_generator.apps.textcat import app as textcat_app
@@ -23,8 +23,8 @@ button[role="tab"][aria-selected="true"]:hover {border-color: var(--button-prima
 image = """<br><img src="https://raw.githubusercontent.com/argilla-io/synthetic-data-generator/main/assets/logo.svg" alt="Synthetic Data Generator Logo" style="display: block; margin-left: auto; margin-right: auto; width: clamp(50%, 400px, 100%)"/>"""
 
 demo = TabbedInterface(
-    [textcat_app, sft_app, eval_app, readme_app],
-    ["Text Classification", "Supervised Fine-Tuning", "Evaluation", "README"],
+    [textcat_app, sft_app, readme_app],
+    ["Text Classification", "Supervised Fine-Tuning", "README"],
     css=css,
     title=image,
     head="Synthetic Data Generator",
src/synthetic_dataset_generator/apps/base.py CHANGED
@@ -67,50 +67,6 @@ def push_pipeline_code_to_hub(
     progress(1.0, desc="Pipeline code uploaded")
 
 
-def push_dataset_to_hub(
-    dataframe: pd.DataFrame,
-    private: bool = True,
-    org_name: str = None,
-    repo_name: str = None,
-    oauth_token: Union[OAuthToken, None] = None,
-    progress=gr.Progress(),
-    labels: List[str] = None,
-    num_labels: int = None,
-    task: str = TEXTCAT_TASK,
-) -> pd.DataFrame:
-    progress(0.1, desc="Setting up dataset")
-    repo_id = validate_push_to_hub(org_name, repo_name)
-
-    if task == TEXTCAT_TASK:
-        if num_labels == 1:
-            dataframe["label"] = dataframe["label"].replace("", None)
-            features = Features(
-                {"text": Value("string"), "label": ClassLabel(names=labels)}
-            )
-        else:
-            features = Features(
-                {
-                    "text": Value("string"),
-                    "labels": Sequence(feature=ClassLabel(names=labels)),
-                }
-            )
-        distiset = Distiset(
-            {"default": Dataset.from_pandas(dataframe, features=features)}
-        )
-    else:
-        distiset = Distiset({"default": Dataset.from_pandas(dataframe)})
-    progress(0.2, desc="Pushing dataset to hub")
-    distiset.push_to_hub(
-        repo_id=repo_id,
-        private=private,
-        include_script=False,
-        token=oauth_token.token,
-        create_pr=False,
-    )
-    progress(1.0, desc="Dataset pushed to hub")
-    return dataframe
-
-
 def validate_push_to_hub(org_name, repo_name):
     repo_id = (
         f"{org_name}/{repo_name}"
src/synthetic_dataset_generator/apps/sft.py CHANGED
@@ -15,7 +15,7 @@ from synthetic_dataset_generator.apps.base import (
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
-from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE, SFT_AVAILABLE
+from synthetic_dataset_generator.constants import DEFAULT_BATCH_SIZE, SFT_AVAILABLE, MODEL
 from synthetic_dataset_generator.pipelines.embeddings import (
     get_embeddings,
     get_sentence_embedding_dimensions,
@@ -49,10 +49,10 @@ def convert_dataframe_messages(dataframe: pd.DataFrame) -> pd.DataFrame:
     return dataframe
 
 
-def generate_system_prompt(dataset_description, temperature, progress=gr.Progress()):
+def generate_system_prompt(dataset_description, progress=gr.Progress()):
     progress(0.0, desc="Generating system prompt")
     progress(0.3, desc="Initializing text generation")
-    generate_description = get_prompt_generator(temperature)
+    generate_description = get_prompt_generator()
    progress(0.7, desc="Generating system prompt")
    result = next(
        generate_description.process(
@@ -92,12 +92,13 @@ def generate_dataset(
     system_prompt: str,
     num_turns: int = 1,
     num_rows: int = 10,
+    temperature: float = 0.9,
     is_sample: bool = False,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
     progress(0.0, desc="(1/2) Generating instructions")
-    magpie_generator = get_magpie_generator(system_prompt, num_turns, is_sample)
-    response_generator = get_response_generator(system_prompt, num_turns, is_sample)
+    magpie_generator = get_magpie_generator(system_prompt, num_turns, temperature, is_sample)
+    response_generator = get_response_generator(system_prompt, num_turns, temperature, is_sample)
     total_steps: int = num_rows * 2
     batch_size = DEFAULT_BATCH_SIZE
 
@@ -216,6 +217,7 @@ def push_dataset(
     num_turns: int = 1,
     num_rows: int = 10,
     private: bool = False,
+    temperature: float = 0.9,
     oauth_token: Union[gr.OAuthToken, None] = None,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
@@ -223,6 +225,7 @@ def push_dataset(
         system_prompt=system_prompt,
         num_turns=num_turns,
         num_rows=num_rows,
+        temperature=temperature,
     )
     push_dataset_to_hub(dataframe, org_name, repo_name, oauth_token, private)
     try:
@@ -439,7 +442,7 @@ with gr.Blocks() as app:
                 label="Temperature",
                 minimum=0.1,
                 maximum=1,
-                value=0.8,
+                value=0.9,
                 step=0.1,
                 interactive=True,
             )
@@ -463,6 +466,7 @@ with gr.Blocks() as app:
         system_prompt=system_prompt.value,
         num_turns=num_turns.value,
         num_rows=num_rows.value,
+        temperature=temperature.value,
     )
     pipeline_code = gr.Code(
         value=code,
@@ -472,7 +476,7 @@ with gr.Blocks() as app:
 
     load_btn.click(
         fn=generate_system_prompt,
-        inputs=[dataset_description, temperature],
+        inputs=[dataset_description],
         outputs=[system_prompt],
         show_progress=True,
     ).then(
@@ -516,6 +520,7 @@ with gr.Blocks() as app:
             num_turns,
             num_rows,
             private,
+            temperature
         ],
         outputs=[success_message],
         show_progress=True,
@@ -525,7 +530,7 @@ with gr.Blocks() as app:
         outputs=[success_message],
     ).success(
         fn=generate_pipeline_code,
-        inputs=[system_prompt, num_turns, num_rows],
+        inputs=[system_prompt, num_turns, num_rows, temperature],
         outputs=[pipeline_code],
     ).success(
         fn=show_pipeline_code_visibility,
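
Note: the net effect in the SFT app is that the temperature slider no longer feeds generate_system_prompt and instead flows into dataset generation and pipeline-code rendering. A runnable toy Gradio sketch of that rewiring, with hypothetical stand-in functions rather than the app's real ones:

```python
import gradio as gr

# Hypothetical stand-ins, for wiring only.
def generate_system_prompt(dataset_description):
    # temperature no longer reaches this step
    return f"You generate a dataset about: {dataset_description}"

def generate_dataset(system_prompt, temperature):
    # temperature now applies to data generation
    return f"rows sampled at temperature={temperature} for: {system_prompt}"

with gr.Blocks() as demo:
    dataset_description = gr.Textbox(label="Dataset description")
    temperature = gr.Slider(minimum=0.1, maximum=1, value=0.9, step=0.1, label="Temperature")
    system_prompt = gr.Textbox(label="System prompt")
    preview = gr.Textbox(label="Preview")
    load_btn = gr.Button("Generate")
    load_btn.click(
        fn=generate_system_prompt,
        inputs=[dataset_description],  # slider removed from this step
        outputs=[system_prompt],
    ).then(
        fn=generate_dataset,
        inputs=[system_prompt, temperature],  # and threaded into this one
        outputs=[preview],
    )

if __name__ == "__main__":
    demo.launch()
```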
src/synthetic_dataset_generator/apps/textcat.py CHANGED
@@ -45,10 +45,10 @@ def _get_dataframe():
     )
 
 
-def generate_system_prompt(dataset_description, temperature, progress=gr.Progress()):
+def generate_system_prompt(dataset_description, progress=gr.Progress()):
     progress(0.0, desc="Generating text classification task")
     progress(0.3, desc="Initializing text generation")
-    generate_description = get_prompt_generator(temperature)
+    generate_description = get_prompt_generator()
     progress(0.7, desc="Generating text classification task")
     result = next(
         generate_description.process(
@@ -89,13 +89,14 @@ def generate_dataset(
     labels: List[str] = None,
     num_labels: int = 1,
     num_rows: int = 10,
+    temperature: float = 0.9,
     is_sample: bool = False,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
     progress(0.0, desc="(1/2) Generating text classification data")
     labels = get_preprocess_labels(labels)
     textcat_generator = get_textcat_generator(
-        difficulty=difficulty, clarity=clarity, is_sample=is_sample
+        difficulty=difficulty, clarity=clarity, temperature=temperature, is_sample=is_sample
     )
     labeller_generator = get_labeller_generator(
         system_prompt=f"{system_prompt} {', '.join(labels)}",
@@ -204,6 +205,7 @@ def push_dataset(
     num_rows: int = 10,
     labels: List[str] = None,
     private: bool = False,
+    temperature: float = 0.8,
     oauth_token: Union[gr.OAuthToken, None] = None,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
@@ -214,6 +216,7 @@ def push_dataset(
         num_labels=num_labels,
         labels=labels,
         num_rows=num_rows,
+        temperature=temperature,
     )
     push_dataset_to_hub(
         dataframe, org_name, repo_name, num_labels, labels, oauth_token, private
@@ -471,6 +474,7 @@ with gr.Blocks() as app:
         labels=labels.value,
         num_labels=num_labels.value,
         num_rows=num_rows.value,
+        temperature=temperature.value,
     )
     pipeline_code = gr.Code(
         value=code,
@@ -480,7 +484,7 @@ with gr.Blocks() as app:
 
     load_btn.click(
         fn=generate_system_prompt,
-        inputs=[dataset_description, temperature],
+        inputs=[dataset_description],
         outputs=[system_prompt, labels],
         show_progress=True,
     ).then(
@@ -537,6 +541,7 @@ with gr.Blocks() as app:
             num_rows,
             labels,
             private,
+            temperature
         ],
         outputs=[success_message],
         show_progress=True,
@@ -553,6 +558,7 @@ with gr.Blocks() as app:
             labels,
             num_labels,
             num_rows,
+            temperature
         ],
         outputs=[pipeline_code],
     ).success(
src/synthetic_dataset_generator/pipelines/sft.py CHANGED
@@ -140,7 +140,7 @@ def _get_output_mappings(num_turns):
     return {"conversation": "messages"}
 
 
-def get_prompt_generator(temperature):
+def get_prompt_generator():
     prompt_generator = TextGeneration(
         llm=InferenceEndpointsLLM(
             api_key=_get_next_api_key(),
@@ -148,7 +148,7 @@ def get_prompt_generator(temperature):
             tokenizer_id=MODEL,
             base_url=BASE_URL,
             generation_kwargs={
-                "temperature": temperature,
+                "temperature": 0.8,
                 "max_new_tokens": 2048,
                 "do_sample": True,
             },
@@ -160,7 +160,7 @@ def get_prompt_generator(temperature):
     return prompt_generator
 
 
-def get_magpie_generator(system_prompt, num_turns, is_sample):
+def get_magpie_generator(system_prompt, num_turns, temperature, is_sample):
     input_mappings = _get_output_mappings(num_turns)
     output_mappings = input_mappings.copy()
     if num_turns == 1:
@@ -172,7 +172,7 @@ def get_magpie_generator(system_prompt, num_turns, is_sample):
             api_key=_get_next_api_key(),
             magpie_pre_query_template=MAGPIE_PRE_QUERY_TEMPLATE,
             generation_kwargs={
-                "temperature": 0.9,
+                "temperature": temperature,
                 "do_sample": True,
                 "max_new_tokens": 256 if is_sample else 512,
                 "stop_sequences": _STOP_SEQUENCES,
@@ -192,7 +192,7 @@ def get_magpie_generator(system_prompt, num_turns, is_sample):
             api_key=_get_next_api_key(),
             magpie_pre_query_template=MAGPIE_PRE_QUERY_TEMPLATE,
             generation_kwargs={
-                "temperature": 0.9,
+                "temperature": temperature,
                 "do_sample": True,
                 "max_new_tokens": 256 if is_sample else 1024,
                 "stop_sequences": _STOP_SEQUENCES,
@@ -243,7 +243,7 @@ def get_response_generator(system_prompt, num_turns, is_sample):
     return response_generator
 
 
-def generate_pipeline_code(system_prompt, num_turns, num_rows):
+def generate_pipeline_code(system_prompt, num_turns, num_rows, temperature):
     input_mappings = _get_output_mappings(num_turns)
     code = f"""
 # Requirements: `pip install distilabel[hf-inference-endpoints]`
@@ -266,7 +266,7 @@ with Pipeline(name="sft") as pipeline:
         base_url=BASE_URL,
         magpie_pre_query_template="llama3",
         generation_kwargs={{
-            "temperature": 0.9,
+            "temperature": {temperature},
             "do_sample": True,
             "max_new_tokens": 2048,
             "stop_sequences": {_STOP_SEQUENCES}
src/synthetic_dataset_generator/pipelines/textcat.py CHANGED
@@ -66,7 +66,7 @@ class TextClassificationTask(BaseModel):
     )
 
 
-def get_prompt_generator(temperature):
+def get_prompt_generator():
     prompt_generator = TextGeneration(
         llm=InferenceEndpointsLLM(
             api_key=_get_next_api_key(),
@@ -74,7 +74,7 @@ def get_prompt_generator(temperature):
             base_url=BASE_URL,
             structured_output={"format": "json", "schema": TextClassificationTask},
             generation_kwargs={
-                "temperature": temperature,
+                "temperature": 0.8,
                 "max_new_tokens": 2048,
                 "do_sample": True,
             },
@@ -86,14 +86,14 @@ def get_prompt_generator(temperature):
     return prompt_generator
 
 
-def get_textcat_generator(difficulty, clarity, is_sample):
+def get_textcat_generator(difficulty, clarity, temperature, is_sample):
     textcat_generator = GenerateTextClassificationData(
         llm=InferenceEndpointsLLM(
             model_id=MODEL,
             base_url=BASE_URL,
             api_key=_get_next_api_key(),
             generation_kwargs={
-                "temperature": 0.9,
+                "temperature": temperature,
                 "max_new_tokens": 256 if is_sample else 2048,
                 "do_sample": True,
                 "top_k": 50,
@@ -135,6 +135,7 @@ def generate_pipeline_code(
     labels: List[str] = None,
     num_labels: int = 1,
     num_rows: int = 10,
+    temperature: float = 0.9,
 ) -> str:
     labels = get_preprocess_labels(labels)
     base_code = f"""
@@ -163,7 +164,7 @@ with Pipeline(name="textcat") as pipeline:
     base_url=BASE_URL,
     api_key=os.environ["API_KEY"],
     generation_kwargs={{
-        "temperature": 0.8,
+        "temperature": {temperature},
         "max_new_tokens": 2048,
        "do_sample": True,
        "top_k": 50,