davidberenstein1957 HF staff committed on
Commit
10b52aa
·
unverified ·
2 Parent(s): 7b7c1be 0c1d5b6

Merge pull request #8 from argilla-io/feat/add-multi-label

Browse files
src/synthetic_dataset_generator/apps/textcat.py CHANGED
@@ -69,14 +69,14 @@ def generate_system_prompt(dataset_description, progress=gr.Progress()):
69
 
70
 
71
  def generate_sample_dataset(
72
- system_prompt, difficulty, clarity, labels, num_labels, progress=gr.Progress()
73
  ):
74
  dataframe = generate_dataset(
75
  system_prompt=system_prompt,
76
  difficulty=difficulty,
77
  clarity=clarity,
78
  labels=labels,
79
- num_labels=num_labels,
80
  num_rows=10,
81
  progress=progress,
82
  is_sample=True,
@@ -89,7 +89,7 @@ def generate_dataset(
89
  difficulty: str,
90
  clarity: str,
91
  labels: List[str] = None,
92
- num_labels: int = 1,
93
  num_rows: int = 10,
94
  temperature: float = 0.9,
95
  is_sample: bool = False,
@@ -105,9 +105,9 @@ def generate_dataset(
105
  is_sample=is_sample,
106
  )
107
  labeller_generator = get_labeller_generator(
108
- system_prompt=f"{system_prompt} {', '.join(labels)}",
109
  labels=labels,
110
- num_labels=num_labels,
111
  )
112
  total_steps: int = num_rows * 2
113
  batch_size = DEFAULT_BATCH_SIZE
@@ -125,11 +125,16 @@ def generate_dataset(
125
  batch_size = min(batch_size, remaining_rows)
126
  inputs = []
127
  for _ in range(batch_size):
128
- if num_labels == 1:
129
- num_labels = 1
 
 
 
 
130
  else:
131
- num_labels = int(random.gammavariate(2, 2) * num_labels)
132
- sampled_labels = random.sample(labels, num_labels)
 
133
  random.shuffle(sampled_labels)
134
  inputs.append(
135
  {
@@ -169,12 +174,7 @@ def generate_dataset(
169
  distiset_results.append(record)
170
 
171
  dataframe = pd.DataFrame(distiset_results)
172
- if num_labels == 1:
173
- dataframe = dataframe.rename(columns={"labels": "label"})
174
- dataframe["label"] = dataframe["label"].apply(
175
- lambda x: x.lower().strip() if x.lower().strip() in labels else None
176
- )
177
- else:
178
  dataframe["labels"] = dataframe["labels"].apply(
179
  lambda x: list(
180
  set(
@@ -186,6 +186,12 @@ def generate_dataset(
186
  )
187
  )
188
  )
 
 
 
 
 
 
189
  progress(1.0, desc="Dataset created")
190
  return dataframe
191
 
@@ -194,7 +200,7 @@ def push_dataset_to_hub(
194
  dataframe: pd.DataFrame,
195
  org_name: str,
196
  repo_name: str,
197
- num_labels: int = 1,
198
  labels: List[str] = None,
199
  oauth_token: Union[gr.OAuthToken, None] = None,
200
  private: bool = False,
@@ -206,18 +212,17 @@ def push_dataset_to_hub(
206
  progress(0.3, desc="Preprocessing")
207
  labels = get_preprocess_labels(labels)
208
  progress(0.7, desc="Creating dataset")
209
- if num_labels == 1:
210
- dataframe["label"] = dataframe["label"].replace("", None)
211
- features = Features(
212
- {"text": Value("string"), "label": ClassLabel(names=labels)}
213
- )
214
- else:
215
  features = Features(
216
  {
217
  "text": Value("string"),
218
  "labels": Sequence(feature=ClassLabel(names=labels)),
219
  }
220
  )
 
 
 
 
221
  dataset = Dataset.from_pandas(dataframe, features=features)
222
  dataset = combine_datasets(repo_id, dataset)
223
  distiset = Distiset({"default": dataset})
@@ -239,7 +244,7 @@ def push_dataset(
239
  system_prompt: str,
240
  difficulty: str,
241
  clarity: str,
242
- num_labels: int = 1,
243
  num_rows: int = 10,
244
  labels: List[str] = None,
245
  private: bool = False,
@@ -252,7 +257,7 @@ def push_dataset(
252
  system_prompt=system_prompt,
253
  difficulty=difficulty,
254
  clarity=clarity,
255
- num_labels=num_labels,
256
  labels=labels,
257
  num_rows=num_rows,
258
  temperature=temperature,
@@ -261,7 +266,7 @@ def push_dataset(
261
  dataframe,
262
  org_name,
263
  repo_name,
264
- num_labels,
265
  labels,
266
  oauth_token,
267
  private,
@@ -288,19 +293,19 @@ def push_dataset(
288
  ],
289
  questions=[
290
  (
291
- rg.LabelQuestion(
292
- name="label",
293
- title="Label",
294
- description="The label of the text",
295
- labels=labels,
296
- )
297
- if num_labels == 1
298
- else rg.MultiLabelQuestion(
299
  name="labels",
300
  title="Labels",
301
  description="The labels of the conversation",
302
  labels=labels,
303
  )
 
 
 
 
 
 
 
304
  ),
305
  ],
306
  metadata=[
@@ -340,16 +345,16 @@ def push_dataset(
340
  suggestions=(
341
  [
342
  rg.Suggestion(
343
- question_name="label" if num_labels == 1 else "labels",
344
  value=(
345
- sample["label"] if num_labels == 1 else sample["labels"]
346
  ),
347
  )
348
  ]
349
  if (
350
- (num_labels == 1 and sample["label"] in labels)
351
  or (
352
- num_labels > 1
353
  and all(label in labels for label in sample["labels"])
354
  )
355
  )
@@ -373,10 +378,6 @@ def validate_input_labels(labels):
373
  return labels
374
 
375
 
376
- def update_max_num_labels(labels):
377
- return gr.update(maximum=len(labels) if labels else 1)
378
-
379
-
380
  def show_pipeline_code_visibility():
381
  return {pipeline_code_ui: gr.Accordion(visible=True)}
382
 
@@ -434,13 +435,11 @@ with gr.Blocks() as app:
434
  multiselect=True,
435
  info="Add the labels to classify the text.",
436
  )
437
- num_labels = gr.Number(
438
- label="Number of labels per text",
439
- value=1,
440
- minimum=1,
441
- maximum=10,
442
- info="Select 1 for single-label and >1 for multi-label.",
443
  interactive=True,
 
444
  )
445
  clarity = gr.Dropdown(
446
  choices=[
@@ -521,7 +520,7 @@ with gr.Blocks() as app:
521
  difficulty=difficulty.value,
522
  clarity=clarity.value,
523
  labels=labels.value,
524
- num_labels=num_labels.value,
525
  num_rows=num_rows.value,
526
  temperature=temperature.value,
527
  )
@@ -538,24 +537,14 @@ with gr.Blocks() as app:
538
  show_progress=True,
539
  ).then(
540
  fn=generate_sample_dataset,
541
- inputs=[system_prompt, difficulty, clarity, labels, num_labels],
542
  outputs=[dataframe],
543
  show_progress=True,
544
- ).then(
545
- fn=update_max_num_labels,
546
- inputs=[labels],
547
- outputs=[num_labels],
548
- )
549
-
550
- labels.input(
551
- fn=update_max_num_labels,
552
- inputs=[labels],
553
- outputs=[num_labels],
554
  )
555
 
556
  btn_apply_to_sample_dataset.click(
557
  fn=generate_sample_dataset,
558
- inputs=[system_prompt, difficulty, clarity, labels, num_labels],
559
  outputs=[dataframe],
560
  show_progress=True,
561
  )
@@ -586,7 +575,7 @@ with gr.Blocks() as app:
586
  system_prompt,
587
  difficulty,
588
  clarity,
589
- num_labels,
590
  num_rows,
591
  labels,
592
  private,
@@ -606,7 +595,7 @@ with gr.Blocks() as app:
606
  difficulty,
607
  clarity,
608
  labels,
609
- num_labels,
610
  num_rows,
611
  temperature,
612
  ],
 
69
 
70
 
71
  def generate_sample_dataset(
72
+ system_prompt, difficulty, clarity, labels, multi_label, progress=gr.Progress()
73
  ):
74
  dataframe = generate_dataset(
75
  system_prompt=system_prompt,
76
  difficulty=difficulty,
77
  clarity=clarity,
78
  labels=labels,
79
+ multi_label=multi_label,
80
  num_rows=10,
81
  progress=progress,
82
  is_sample=True,
 
89
  difficulty: str,
90
  clarity: str,
91
  labels: List[str] = None,
92
+ multi_label: bool = False,
93
  num_rows: int = 10,
94
  temperature: float = 0.9,
95
  is_sample: bool = False,
 
105
  is_sample=is_sample,
106
  )
107
  labeller_generator = get_labeller_generator(
108
+ system_prompt=f"{system_prompt}. Potential labels: {', '.join(labels)}",
109
  labels=labels,
110
+ multi_label=multi_label,
111
  )
112
  total_steps: int = num_rows * 2
113
  batch_size = DEFAULT_BATCH_SIZE
 
125
  batch_size = min(batch_size, remaining_rows)
126
  inputs = []
127
  for _ in range(batch_size):
128
+ if multi_label:
129
+ num_labels = len(labels)
130
+ k = int(
131
+ random.betavariate(alpha=(num_labels - 1), beta=num_labels)
132
+ * num_labels
133
+ )
134
  else:
135
+ k = 1
136
+
137
+ sampled_labels = random.sample(labels, min(k, len(labels)))
138
  random.shuffle(sampled_labels)
139
  inputs.append(
140
  {
 
174
  distiset_results.append(record)
175
 
176
  dataframe = pd.DataFrame(distiset_results)
177
+ if multi_label:
 
 
 
 
 
178
  dataframe["labels"] = dataframe["labels"].apply(
179
  lambda x: list(
180
  set(
 
186
  )
187
  )
188
  )
189
+ else:
190
+ dataframe = dataframe.rename(columns={"labels": "label"})
191
+ dataframe["label"] = dataframe["label"].apply(
192
+ lambda x: x.lower().strip() if x.lower().strip() in labels else None
193
+ )
194
+
195
  progress(1.0, desc="Dataset created")
196
  return dataframe
197
 
 
200
  dataframe: pd.DataFrame,
201
  org_name: str,
202
  repo_name: str,
203
+ multi_label: bool = False,
204
  labels: List[str] = None,
205
  oauth_token: Union[gr.OAuthToken, None] = None,
206
  private: bool = False,
 
212
  progress(0.3, desc="Preprocessing")
213
  labels = get_preprocess_labels(labels)
214
  progress(0.7, desc="Creating dataset")
215
+ if multi_label:
 
 
 
 
 
216
  features = Features(
217
  {
218
  "text": Value("string"),
219
  "labels": Sequence(feature=ClassLabel(names=labels)),
220
  }
221
  )
222
+ else:
223
+ features = Features(
224
+ {"text": Value("string"), "label": ClassLabel(names=labels)}
225
+ )
226
  dataset = Dataset.from_pandas(dataframe, features=features)
227
  dataset = combine_datasets(repo_id, dataset)
228
  distiset = Distiset({"default": dataset})
 
244
  system_prompt: str,
245
  difficulty: str,
246
  clarity: str,
247
+ multi_label: int = 1,
248
  num_rows: int = 10,
249
  labels: List[str] = None,
250
  private: bool = False,
 
257
  system_prompt=system_prompt,
258
  difficulty=difficulty,
259
  clarity=clarity,
260
+ multi_label=multi_label,
261
  labels=labels,
262
  num_rows=num_rows,
263
  temperature=temperature,
 
266
  dataframe,
267
  org_name,
268
  repo_name,
269
+ multi_label,
270
  labels,
271
  oauth_token,
272
  private,
 
293
  ],
294
  questions=[
295
  (
296
+ rg.MultiLabelQuestion(
 
 
 
 
 
 
 
297
  name="labels",
298
  title="Labels",
299
  description="The labels of the conversation",
300
  labels=labels,
301
  )
302
+ if multi_label
303
+ else rg.LabelQuestion(
304
+ name="label",
305
+ title="Label",
306
+ description="The label of the text",
307
+ labels=labels,
308
+ )
309
  ),
310
  ],
311
  metadata=[
 
345
  suggestions=(
346
  [
347
  rg.Suggestion(
348
+ question_name="labels" if multi_label else "label",
349
  value=(
350
+ sample["labels"] if multi_label else sample["label"]
351
  ),
352
  )
353
  ]
354
  if (
355
+ (not multi_label and sample["label"] in labels)
356
  or (
357
+ multi_label
358
  and all(label in labels for label in sample["labels"])
359
  )
360
  )
 
378
  return labels
379
 
380
 
 
 
 
 
381
  def show_pipeline_code_visibility():
382
  return {pipeline_code_ui: gr.Accordion(visible=True)}
383
 
 
435
  multiselect=True,
436
  info="Add the labels to classify the text.",
437
  )
438
+ multi_label = gr.Checkbox(
439
+ label="Multi-label",
440
+ value=False,
 
 
 
441
  interactive=True,
442
+ info="If checked, the text will be classified into multiple labels.",
443
  )
444
  clarity = gr.Dropdown(
445
  choices=[
 
520
  difficulty=difficulty.value,
521
  clarity=clarity.value,
522
  labels=labels.value,
523
+ num_labels=len(labels.value) if multi_label.value else 1,
524
  num_rows=num_rows.value,
525
  temperature=temperature.value,
526
  )
 
537
  show_progress=True,
538
  ).then(
539
  fn=generate_sample_dataset,
540
+ inputs=[system_prompt, difficulty, clarity, labels, multi_label],
541
  outputs=[dataframe],
542
  show_progress=True,
 
 
 
 
 
 
 
 
 
 
543
  )
544
 
545
  btn_apply_to_sample_dataset.click(
546
  fn=generate_sample_dataset,
547
+ inputs=[system_prompt, difficulty, clarity, labels, multi_label],
548
  outputs=[dataframe],
549
  show_progress=True,
550
  )
 
575
  system_prompt,
576
  difficulty,
577
  clarity,
578
+ multi_label,
579
  num_rows,
580
  labels,
581
  private,
 
595
  difficulty,
596
  clarity,
597
  labels,
598
+ multi_label,
599
  num_rows,
600
  temperature,
601
  ],
src/synthetic_dataset_generator/pipelines/textcat.py CHANGED
@@ -29,7 +29,7 @@ Description: DavidMovieHouse is a cinema that has been in business for 10 years.
29
  Output: {"classification_task": "The company DavidMovieHouse is a cinema that has been in business for 10 years and has had customers reviews of varying customer groups. Classify the customer reviews as", "labels": ["positive", "negative"]}
30
 
31
  Description: A dataset that focuses on creating neo-ludite discussions about technologies within the AI space.
32
- Output: {"classification_task": "Neo-ludiite discussions about technologies within the AI space cover from different speaking people . Categorize the discussions into one of the following categories", "labels": ["tech-support", "tech-opposition"]}
33
 
34
  Description: A dataset that covers the articles of a niche sports website called TheSportBlogs that focuses on female sports within the ballsport domain for the US market.
35
  Output: {"classification_task": "TechSportBlogs is a niche sports website that focuses on female sports within the ballsport domain for the US market. Written by different journalists. Determine the category of based on the article using the following categories", "labels": ["basketball", "volleyball", "tennis", "hockey", "baseball", "soccer"]}
@@ -102,7 +102,7 @@ def get_textcat_generator(difficulty, clarity, temperature, is_sample):
102
  return textcat_generator
103
 
104
 
105
- def get_labeller_generator(system_prompt, labels, num_labels):
106
  labeller_generator = TextClassification(
107
  llm=InferenceEndpointsLLM(
108
  model_id=MODEL,
@@ -115,7 +115,7 @@ def get_labeller_generator(system_prompt, labels, num_labels):
115
  ),
116
  context=system_prompt,
117
  available_labels=labels,
118
- n=num_labels,
119
  default_label="unknown",
120
  )
121
  labeller_generator.load()
 
29
  Output: {"classification_task": "The company DavidMovieHouse is a cinema that has been in business for 10 years and has had customers reviews of varying customer groups. Classify the customer reviews as", "labels": ["positive", "negative"]}
30
 
31
  Description: A dataset that focuses on creating neo-ludite discussions about technologies within the AI space.
32
+ Output: {"classification_task": "Neo-ludiite discussions about technologies within the AI space cover from different speaking people. Categorize the discussions into one of the following categories", "labels": ["tech-support", "tech-opposition"]}
33
 
34
  Description: A dataset that covers the articles of a niche sports website called TheSportBlogs that focuses on female sports within the ballsport domain for the US market.
35
  Output: {"classification_task": "TechSportBlogs is a niche sports website that focuses on female sports within the ballsport domain for the US market. Written by different journalists. Determine the category of based on the article using the following categories", "labels": ["basketball", "volleyball", "tennis", "hockey", "baseball", "soccer"]}
 
102
  return textcat_generator
103
 
104
 
105
+ def get_labeller_generator(system_prompt, labels, multi_label):
106
  labeller_generator = TextClassification(
107
  llm=InferenceEndpointsLLM(
108
  model_id=MODEL,
 
115
  ),
116
  context=system_prompt,
117
  available_labels=labels,
118
+ n=len(labels) if multi_label else 1,
119
  default_label="unknown",
120
  )
121
  labeller_generator.load()