Spaces:

argilla
/

synthetic-data-generator

Running

App Files Files Community

sdiazlor HF staff committed on Dec 3, 2024

Commit

34371d3

1 Parent(s): 49d5948

add evaluation task

Browse files

Files changed (7) hide show

app.py +3 -2
pyproject.toml +1 -1
src/distilabel_dataset_generator/apps/eval.py +672 -199
src/distilabel_dataset_generator/apps/sft.py +4 -0
src/distilabel_dataset_generator/apps/textcat.py +11 -0
src/distilabel_dataset_generator/pipelines/eval.py +205 -0
src/distilabel_dataset_generator/utils.py +90 -1

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import gradio as gr
 from src.distilabel_dataset_generator._tabbedinterface import TabbedInterface
 from src.distilabel_dataset_generator.apps.faq import app as faq_app
 from src.distilabel_dataset_generator.apps.sft import app as sft_app
 from src.distilabel_dataset_generator.apps.textcat import app as textcat_app
 theme = gr.themes.Monochrome(
@@ -54,8 +55,8 @@ button[role="tab"][data-tab-id][aria-selected="true"] {
 """
 demo = TabbedInterface(
-    [textcat_app, sft_app, faq_app],
-    ["Text Classification", "Supervised Fine-Tuning", "FAQ"],
     css=css,
     title="""
     <h1>Synthetic Data Generator</h1>

 from src.distilabel_dataset_generator._tabbedinterface import TabbedInterface
 from src.distilabel_dataset_generator.apps.faq import app as faq_app
 from src.distilabel_dataset_generator.apps.sft import app as sft_app
+from src.distilabel_dataset_generator.apps.eval import app as eval_app
 from src.distilabel_dataset_generator.apps.textcat import app as textcat_app
 theme = gr.themes.Monochrome(
 """
 demo = TabbedInterface(
+    [textcat_app, sft_app, eval_app, faq_app],
+    ["Text Classification", "Supervised Fine-Tuning", "Evaluation", "FAQ"],
     css=css,
     title="""
     <h1>Synthetic Data Generator</h1>

pyproject.toml CHANGED Viewed

@@ -6,7 +6,7 @@ authors = [
     {name = "davidberenstein1957", email = "david.m.berenstein@gmail.com"},
 ]
 dependencies = [
-    "distilabel[hf-inference-endpoints,argilla,outlines]>=1.4.1",
     "gradio[oauth]<5.0.0",
     "transformers>=4.44.2",
     "sentence-transformers>=3.2.0",

     {name = "davidberenstein1957", email = "david.m.berenstein@gmail.com"},
 ]
 dependencies = [
+    "distilabel[hf-inference-endpoints,argilla,outlines,instructor]>=1.4.1",
     "gradio[oauth]<5.0.0",
     "transformers>=4.44.2",
     "sentence-transformers>=3.2.0",

src/distilabel_dataset_generator/apps/eval.py CHANGED Viewed

@@ -1,70 +1,106 @@
 import json
 import gradio as gr
 import pandas as pd
-from datasets import load_dataset
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
-from src.distilabel_dataset_generator.utils import get_org_dropdown
-def get_iframe(hub_repo_id) -> str:
     if not hub_repo_id:
-        raise gr.Error("Hub repo id is required")
     url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer"
     iframe = f"""
     <iframe
-  src="{url}"
-  frameborder="0"
-  width="100%"
-  height="600px"
-></iframe>
-"""
     return iframe
-def get_valid_columns(df: pd.DataFrame):
-    valid_columns = []
-    for col in df.columns:
-        sample_val = df[col].iloc[0]
         if isinstance(sample_val, str) or (
-            isinstance(sample_val, list)
-            and all(isinstance(item, dict) for item in sample_val)
         ):
-            valid_columns.append(col)
-    return valid_columns
-def load_dataset_from_hub(hub_repo_id: str, num_rows: int = 10):
-    gr.Info(message="Loading dataset ...")
-    if not hub_repo_id:
         raise gr.Error("Hub repo id is required")
-    ds_dict = load_dataset(hub_repo_id)
-    splits = list(ds_dict.keys())
     ds = ds_dict[splits[0]]
     if num_rows:
         ds = ds.select(range(num_rows))
-    df = ds.to_pandas()
-    # Get columns that contain either strings or lists of dictionaries
-    valid_columns = get_valid_columns(df)
     return (
-        df,
-        gr.Dropdown(choices=valid_columns, label="Instruction Column"),
-        gr.Dropdown(choices=valid_columns, label="Instruction Column"),
-        gr.Dropdown(choices=valid_columns, label="Response Column"),
     )
 def define_evaluation_aspects(task_type: str):
-    if task_type == "instruction":
-        return gr.Dropdown(
-            value=["overall-rating"],
-            choices=["complexity", "quality"],
-            label="Evaluation Aspects",
-            multiselect=True,
-            interactive=True,
-        )
-    elif task_type == "instruction-response":
         return gr.Dropdown(
             value=["overall-rating"],
             choices=["helpfulness", "truthfulness", "overall-rating", "honesty"],
@@ -76,106 +112,473 @@ def define_evaluation_aspects(task_type: str):
         return gr.Dropdown(interactive=False, visible=False)
-def evaluate_instruction(df: pd.DataFrame, aspects: list[str], instruction_column: str):
-    pass
 def evaluate_instruction_response(
-    df: pd.DataFrame, aspects: list[str], instruction_column: str, response_column: str
 ):
-    pass
 def evaluate_custom(
-    df: pd.DataFrame, aspects: list[str], prompt_template: str, structured_output: dict
 ):
-    pass
-def _apply_to_dataset(
-    df: pd.DataFrame,
     eval_type: str,
-    aspects_instruction: list[str],
-    instruction_column: str,
     aspects_instruction_response: list[str],
-    instruction_column_response: str,
-    response_column_response: str,
-    aspects_custom: list[str],
     prompt_template: str,
     structured_output: dict,
 ):
-    if eval_type == "instruction":
-        df = evaluate_instruction(df, aspects_instruction, instruction_column)
-    elif eval_type == "instruction-response":
-        df = evaluate_instruction_response(
-            df,
-            aspects_instruction_response,
-            instruction_column_response,
-            response_column_response,
         )
-    elif eval_type == "custom":
-        df = evaluate_custom(df, aspects_custom, prompt_template, structured_output)
-    return df
-def apply_to_sample_dataset(
     repo_id: str,
     eval_type: str,
-    aspects_instruction: list[str],
     aspects_instruction_response: list[str],
-    aspects_custom: list[str],
-    instruction_instruction: str,
     instruction_instruction_response: str,
     response_instruction_response: str,
     prompt_template: str,
     structured_output: dict,
 ):
-    df, _, _, _ = load_dataset_from_hub(repo_id, num_rows=10)
-    df = _apply_to_dataset(
-        df,
-        eval_type,
-        aspects_instruction,
-        instruction_instruction,
-        aspects_instruction_response,
-        instruction_instruction_response,
-        response_instruction_response,
-        aspects_custom,
-        prompt_template,
-        structured_output,
     )
-    return df
-def push_to_hub(
     org_name: str,
     repo_name: str,
     private: bool,
     num_rows: int,
     original_repo_id: str,
     eval_type: str,
-    aspects_instruction: list[str],
     aspects_instruction_response: list[str],
-    aspects_custom: list[str],
-    instruction_instruction: str,
     instruction_instruction_response: str,
     response_instruction_response: str,
     prompt_template: str,
     structured_output: dict,
-):
-    df, _, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows)
-    df = _apply_to_dataset(
-        df,
-        eval_type,
-        aspects_instruction,
-        instruction_instruction,
-        aspects_instruction_response,
-        instruction_instruction_response,
-        response_instruction_response,
-        aspects_custom,
-        prompt_template,
-        structured_output,
     )
-    new_repo_id = f"{org_name}/{repo_name}"
 ######################
@@ -184,123 +587,157 @@ def push_to_hub(
 with gr.Blocks() as app:
-    gr.Markdown("## 1. Select your input dataset")
-    with gr.Row(equal_height=False):
-        with gr.Column(scale=1):
-            search_in = HuggingfaceHubSearch(
-                label="Search",
-                placeholder="Search for a Dataset",
-                search_type="dataset",
-                sumbit_on_select=True,
-            )
-            load_btn = gr.Button("Load dataset")
-        with gr.Column(scale=3):
-            search_out = gr.HTML(label="Dataset Preview")
-    gr.HTML("<hr>")
-    gr.Markdown("## 2. Configure your task")
-    with gr.Row(equal_height=False):
-        with gr.Column(scale=1):
-            eval_type = gr.Dropdown(
-                label="Evaluation Type",
-                choices=["instruction", "instruction-response", "custom-template"],
-                visible=False,
-            )
-            with gr.Tab("instruction") as tab_instruction:
-                aspects_instruction = define_evaluation_aspects("instruction")
-                instruction_instruction = gr.Dropdown(
-                    label="Instruction Column", interactive=True
                 )
-                tab_instruction.select(
-                    lambda: "instruction",
-                    inputs=[],
-                    outputs=[eval_type],
-                )
-            with gr.Tab("instruction-response") as tab_instruction_response:
-                aspects_instruction_response = define_evaluation_aspects(
-                    "instruction-response"
-                )
-                instruction_instruction_response = gr.Dropdown(
-                    label="Instruction Column", interactive=True
                 )
-                response_instruction_response = gr.Dropdown(
-                    label="Response Column", interactive=True
                 )
-                tab_instruction_response.select(
-                    lambda: "instruction-response",
-                    inputs=[],
-                    outputs=[eval_type],
                 )
-            with gr.Tab("custom") as tab_custom:
-                aspects_custom = define_evaluation_aspects("custom")
-                prompt_template = gr.Code(
-                    label="Prompt Template",
-                    value="{{column_1}} based on {{column_2}}",
-                    language="markdown",
                     interactive=True,
                 )
-                structured_output = gr.Code(
-                    label="Structured Output",
-                    value=json.dumps({"eval_aspect": "str"}),
-                    language="json",
                     interactive=True,
                 )
-                tab_custom.select(
-                    lambda: "custom-template",
-                    inputs=[],
-                    outputs=[eval_type],
                 )
-            btn_apply_to_sample_dataset = gr.Button("Refresh dataset")
-        with gr.Column(scale=3):
-            dataframe = gr.Dataframe(wrap=True, height=300)
-    gr.HTML("<hr>")
-    gr.Markdown("## 3. Generate your dataset")
-    with gr.Row():
-        with gr.Column(scale=2):
-            org_name = get_org_dropdown()
-            repo_name = gr.Textbox(
-                label="Repo name",
-                placeholder="dataset_name",
-                value="my-distiset",
-                interactive=True,
-            )
-            num_rows = gr.Number(
-                label="Number of rows",
-                value=10,
-                interactive=True,
-                scale=1,
-            )
-            private = gr.Checkbox(
-                label="Private dataset",
-                value=False,
-                interactive=True,
-                scale=1,
-            )
-            btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
-        with gr.Column(scale=3):
-            success_message = gr.Markdown(visible=False)
-    search_in.submit(get_iframe, inputs=search_in, outputs=search_out)
     load_btn.click(
-        load_dataset_from_hub,
         inputs=[search_in],
         outputs=[
             dataframe,
-            instruction_instruction,
             instruction_instruction_response,
             response_instruction_response,
         ],
     )
     btn_apply_to_sample_dataset.click(
-        apply_to_sample_dataset,
         inputs=[
             search_in,
             eval_type,
-            aspects_instruction,
             aspects_instruction_response,
-            aspects_custom,
-            instruction_instruction,
             instruction_instruction_response,
             response_instruction_response,
             prompt_template,
@@ -308,8 +745,23 @@ with gr.Blocks() as app:
         ],
         outputs=dataframe,
     )
     btn_push_to_hub.click(
-        push_to_hub,
         inputs=[
             org_name,
             repo_name,
@@ -317,15 +769,36 @@ with gr.Blocks() as app:
             num_rows,
             search_in,
             eval_type,
-            aspects_instruction,
             aspects_instruction_response,
-            aspects_custom,
-            instruction_instruction,
             instruction_instruction_response,
             response_instruction_response,
             prompt_template,
             structured_output,
         ],
-        outputs=success_message,
     )
     app.load(fn=get_org_dropdown, outputs=[org_name])

 import json
+import uuid
+from typing import Union
+import argilla as rg
 import gradio as gr
+import numpy as np
 import pandas as pd
+from datasets import (
+    Dataset,
+    get_dataset_config_names,
+    get_dataset_split_names,
+    load_dataset,
+)
+from distilabel.distiset import Distiset
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
+from huggingface_hub import HfApi
+from src.distilabel_dataset_generator.apps.base import (
+    hide_success_message,
+    show_success_message,
+    validate_argilla_user_workspace_dataset,
+    validate_push_to_hub,
+)
+from src.distilabel_dataset_generator.pipelines.base import (
+    DEFAULT_BATCH_SIZE,
+)
+from src.distilabel_dataset_generator.pipelines.embeddings import (
+    get_embeddings,
+    get_sentence_embedding_dimensions,
+)
+from src.distilabel_dataset_generator.pipelines.eval import (
+    generate_pipeline_code,
+    get_custom_evaluator,
+    get_ultrafeedback_evaluator,
+)
+from src.distilabel_dataset_generator.utils import (
+    column_to_list,
+    extract_column_names,
+    get_argilla_client,
+    get_org_dropdown,
+    process_columns,
+    swap_visibility,
+    pad_or_truncate_list,
+)
def get_iframe(hub_repo_id: str) -> str:
    """Return an ``<iframe>`` snippet embedding the Hub dataset viewer.

    Args:
        hub_repo_id: Dataset repository id, e.g. ``"org/name"``.

    Returns:
        HTML markup whose ``src`` points at the embedded viewer of the
        given dataset repository.

    Raises:
        gr.Error: If ``hub_repo_id`` is empty.
    """
    # Local import keeps this block self-contained; no file-level change needed.
    from urllib.parse import quote

    if not hub_repo_id:
        raise gr.Error("Hub repository ID is required.")
    # The id is free text coming from the UI. Percent-encode it so characters
    # such as `"`, `<` or `>` cannot break out of the `src` attribute. Ids
    # made of letters, digits, `/`, `-`, `_` and `.` are left untouched.
    safe_repo_id = quote(hub_repo_id, safe="/")
    url = f"https://huggingface.co/datasets/{safe_repo_id}/embed/viewer"
    iframe = f"""
    <iframe
        src="{url}"
        frameborder="0"
        width="100%"
        height="600px"
    ></iframe>
    """
    return iframe
def get_valid_columns(dataframe: pd.DataFrame) -> tuple[list, list]:
    """Pick the columns usable as instruction and/or response inputs.

    Only the first row of each column is inspected. A column qualifies as:

    * instruction *and* response when the sample is a string, or a
      list/array whose items are all dicts containing a ``"role"`` key;
    * response only when the sample is a list/array of strings.

    Args:
        dataframe: Frame whose columns are classified.

    Returns:
        ``(instruction_valid_columns, response_valid_columns)``, each in the
        frame's column order and free of duplicates.
    """
    instruction_valid_columns = []
    response_valid_columns = []
    # Without rows there is no sample to inspect; `iloc[0]` would raise.
    if dataframe.empty:
        return instruction_valid_columns, response_valid_columns

    for col in dataframe.columns:
        sample_val = dataframe[col].iloc[0]
        is_sequence = isinstance(sample_val, (list, np.ndarray))
        if isinstance(sample_val, str) or (
            is_sequence
            and all(isinstance(item, dict) and "role" in item for item in sample_val)
        ):
            instruction_valid_columns.append(col)
            response_valid_columns.append(col)
        # `elif`, not `if`: an empty sequence satisfies both `all(...)` checks
        # vacuously and would otherwise be listed twice as a response column.
        elif is_sequence and all(isinstance(item, str) for item in sample_val):
            response_valid_columns.append(col)
    return instruction_valid_columns, response_valid_columns
def load_dataset_from_hub(repo_id: str, num_rows: int = 10):
    """Load the first split of the first subset of a Hub dataset.

    Args:
        repo_id: Dataset repository id on the Hub.
        num_rows: Maximum number of rows to keep. A falsy value keeps the
            whole split.

    Returns:
        A tuple ``(dataframe, instruction_dropdown, response_dropdown)`` where
        the dropdowns are pre-filled with the columns accepted by
        ``get_valid_columns``.

    Raises:
        gr.Error: If ``repo_id`` is empty.
    """
    if not repo_id:
        raise gr.Error("Hub repo id is required")
    subsets = get_dataset_config_names(repo_id)
    ds_dict = load_dataset(repo_id, subsets[0])
    splits = get_dataset_split_names(repo_id, subsets[0])
    ds = ds_dict[splits[0]]
    if num_rows:
        # Clamp to the split size: selecting indices past the end raises.
        # `int()` guards against a float row count (e.g. from a numeric UI
        # input), which `range()` would reject.
        ds = ds.select(range(min(int(num_rows), len(ds))))
    dataframe = ds.to_pandas()
    instruction_valid_columns, response_valid_columns = get_valid_columns(dataframe)
    return (
        dataframe,
        gr.Dropdown(choices=instruction_valid_columns, label="Instruction column"),
        gr.Dropdown(choices=response_valid_columns, label="Response column"),
    )
 def define_evaluation_aspects(task_type: str):
+    if task_type == "ultrafeedback":
         return gr.Dropdown(
             value=["overall-rating"],
             choices=["helpfulness", "truthfulness", "overall-rating", "honesty"],
         return gr.Dropdown(interactive=False, visible=False)
 def evaluate_instruction_response(
+    dataframe: pd.DataFrame,
+    aspects: list[str],
+    instruction_column: str,
+    response_columns: str,
+    num_rows: int = 10,
+    is_sample: bool = False,
+    progress=gr.Progress(),
 ):
+    progress(0.0, desc="Evaluating instructions and responses")
+    data = process_columns(dataframe, instruction_column, response_columns)
+    num_generations = len(data[0]["generations"])
+    evaluated_results = []
+    for entry in data:
+        result_row = {
+            "instruction": entry["instruction"],
+            "generations": entry["generations"],
+        }
+        for aspect in aspects:
+            result_row[f"ratings_{aspect}"] = None
+            result_row[f"rationale_for_ratings_{aspect}"] = None
+            if aspect in ["truthfulness", "helpfulness"]:
+                result_row[f"type_{aspect}"] = None
+                result_row[f"rationale_for_type_{aspect}"] = None
+        result_row["model_name"] = None
+        evaluated_results.append(result_row)
+    batch_size = DEFAULT_BATCH_SIZE
+    total_steps: int = len(aspects) * num_rows
+    # evaluate instructions and responses
+    for aspect in aspects:
+        ultrafeedback_evaluator = get_ultrafeedback_evaluator(aspect, is_sample)
+        n_processed = 0
+        while n_processed < num_rows:
+            progress(
+                (len(aspects) * n_processed) / total_steps,
+                total=total_steps,
+                desc=f"Evaluating aspect: {aspect}",
+            )
+            remaining_rows = num_rows - n_processed
+            batch_size = min(batch_size, remaining_rows)
+            inputs = data[n_processed : n_processed + batch_size]
+            batch_results = list(ultrafeedback_evaluator.process(inputs=inputs))
+            for j, result in enumerate(batch_results[0]):
+                idx = n_processed + j
+                evaluated_results[idx][f"ratings_{aspect}"] = pad_or_truncate_list(
+                    result.get("ratings"), num_generations
+                )
+                evaluated_results[idx]["model_name"] = result.get("model_name")
+                if aspect in ["truthfulness", "helpfulness"]:
+                    evaluated_results[idx][f"type_{aspect}"] = pad_or_truncate_list(
+                        result.get("types"), num_generations
+                    )
+                    evaluated_results[idx][f"rationale_for_type_{aspect}"] = (
+                        pad_or_truncate_list(result.get("rationales"), num_generations)
+                    )
+                    evaluated_results[idx][f"rationale_for_ratings_{aspect}"] = (
+                        pad_or_truncate_list(
+                            result.get("rationales-for-ratings"), num_generations
+                        )
+                    )
+                else:
+                    evaluated_results[idx][f"rationale_for_ratings_{aspect}"] = (
+                        pad_or_truncate_list(result.get("rationales"), num_generations)
+                    )
+            n_processed += batch_size
+    # create final dataset
+    dataframe = pd.DataFrame(evaluated_results)
+    progress(1.0, desc="Dataset evaluation completed")
+    return dataframe
 def evaluate_custom(
+    dataframe: pd.DataFrame,
+    prompt_template: str,
+    structured_output: dict,
+    num_rows: int = 10,
+    is_sample: bool = False,
+    progress=gr.Progress(),
 ):
+    progress(0.0, desc="Evaluating dataset")
+    columns = extract_column_names(prompt_template)
+    input_columns = {column: column_to_list(dataframe, column) for column in columns}
+    custom_evaluator = get_custom_evaluator(
+        prompt_template, structured_output, columns, is_sample
+    )
+    batch_size = DEFAULT_BATCH_SIZE
+    # evaluate the data
+    n_processed = 0
+    evaluation_results = []
+    while n_processed < num_rows:
+        progress(
+            n_processed / num_rows,
+            desc="Evaluating dataset",
+        )
+        remaining_rows = num_rows - n_processed
+        batch_size = min(batch_size, remaining_rows)
+        inputs = []
+        for idx in range(n_processed, n_processed + batch_size):
+            input = {column: input_columns[column][idx] for column in input_columns}
+            inputs.append(input)
+        batch = list(custom_evaluator.process(inputs=inputs))
+        evaluation_results.extend(batch[0])
+        n_processed += batch_size
+    # create final dataset
+    distiset_results = []
+    for result in evaluation_results:
+        record = {key: result[key] for key in result if key != "distilabel_metadata"}
+        distiset_results.append(record)
+    dataframe = pd.DataFrame(distiset_results)
+    progress(1.0, desc="Dataset evaluation completed")
+    return dataframe
def _evaluate_dataset(
    dataframe: pd.DataFrame,
    eval_type: str,
    aspects_instruction_response: list[str],
    instruction_instruction_response: str,
    response_instruction_response: str,
    prompt_template: str,
    structured_output: dict,
    num_rows: int = 10,
    is_sample: bool = False,
):
    """Dispatch to the evaluator matching ``eval_type``.

    ``"ultrafeedback"`` routes to ``evaluate_instruction_response``; any
    other value routes to ``evaluate_custom``. The evaluated DataFrame is
    returned unchanged from the chosen evaluator.
    """
    if eval_type != "ultrafeedback":
        return evaluate_custom(
            dataframe=dataframe,
            prompt_template=prompt_template,
            structured_output=structured_output,
            num_rows=num_rows,
            is_sample=is_sample,
        )
    return evaluate_instruction_response(
        dataframe=dataframe,
        aspects=aspects_instruction_response,
        instruction_column=instruction_instruction_response,
        response_columns=response_instruction_response,
        num_rows=num_rows,
        is_sample=is_sample,
    )
def evaluate_sample_dataset(
    repo_id: str,
    eval_type: str,
    aspects_instruction_response: list[str],
    instruction_instruction_response: str,
    response_instruction_response: str,
    prompt_template: str,
    structured_output: dict,
):
    """Evaluate a 10-row preview of a Hub dataset in sample mode.

    Loads the first rows of ``repo_id`` and runs them through
    ``_evaluate_dataset`` with ``is_sample=True``, returning the resulting
    DataFrame.
    """
    sample_size = 10
    # Only the frame is needed here; the two dropdowns are discarded.
    sample_frame, _, _ = load_dataset_from_hub(repo_id, num_rows=sample_size)
    return _evaluate_dataset(
        dataframe=sample_frame,
        eval_type=eval_type,
        aspects_instruction_response=aspects_instruction_response,
        instruction_instruction_response=instruction_instruction_response,
        response_instruction_response=response_instruction_response,
        prompt_template=prompt_template,
        structured_output=structured_output,
        num_rows=sample_size,
        is_sample=True,
    )
def push_dataset_to_hub(
    dataframe: pd.DataFrame, org_name: str, repo_name: str, oauth_token, private
):
    """Push the evaluated frame to the Hub as a single-subset distiset.

    Args:
        dataframe: Evaluated rows to upload.
        org_name: Organisation or user name, checked by
            ``validate_push_to_hub`` together with ``repo_name``.
        repo_name: Name of the target dataset repository.
        oauth_token: Object exposing the user's token as ``.token``.
        private: Whether the repository is created as private.

    Raises:
        gr.Error: If no OAuth token is available.
    """
    repo_id = validate_push_to_hub(org_name, repo_name)
    # The caller's token defaults to None; fail with a readable message
    # instead of an AttributeError on `.token`.
    if oauth_token is None:
        raise gr.Error("Please sign in with Hugging Face to push the dataset.")
    distiset = Distiset({"default": Dataset.from_pandas(dataframe)})
    distiset.push_to_hub(
        repo_id=repo_id,
        private=private,
        include_script=False,
        token=oauth_token.token,
        create_pr=False,
    )
+def push_dataset(
     org_name: str,
     repo_name: str,
     private: bool,
     num_rows: int,
     original_repo_id: str,
     eval_type: str,
     aspects_instruction_response: list[str],
     instruction_instruction_response: str,
     response_instruction_response: str,
     prompt_template: str,
     structured_output: dict,
+    oauth_token: Union[gr.OAuthToken, None] = None,
+    progress=gr.Progress(),
+) -> pd.DataFrame:
+    dataframe, _, _ = load_dataset_from_hub(original_repo_id, num_rows=num_rows)
+    dataframe = _evaluate_dataset(
+        dataframe=dataframe,
+        eval_type=eval_type,
+        aspects_instruction_response=aspects_instruction_response,
+        instruction_instruction_response=instruction_instruction_response,
+        response_instruction_response=response_instruction_response,
+        prompt_template=prompt_template,
+        structured_output=structured_output,
+        num_rows=num_rows,
     )
+    push_dataset_to_hub(dataframe, org_name, repo_name, oauth_token, private)
+    try:
+        progress(0.1, desc="Setting up user and workspace")
+        client = get_argilla_client()
+        hf_user = HfApi().whoami(token=oauth_token.token)["name"]
+        if eval_type == "ultrafeedback":
+            num_generations = len((dataframe["generations"][0]))
+            fields = [
+                rg.ChatField(
+                    name=f"chat_{i}",
+                    title=f"Chat {i+1}",
+                    description=f"User and assistant conversation for generation {i+1}",
+                )
+                for i in range(num_generations)
+            ]
+            questions = []
+            for i in range(num_generations):
+                for aspect in aspects_instruction_response:
+                    questions.append(
+                        rg.RatingQuestion(
+                            name=f"ratings_{aspect}_{i}",
+                            values=list(range(11)),
+                            title=f"Ratings for {aspect} for response {i+1}",
+                            required=True,
+                        )
+                    )
+                    questions.append(
+                        rg.TextQuestion(
+                            name=f"rationale_for_ratings_{aspect}_{i}",
+                            title=f"Rationale for ratings for {aspect} for response {i+1}",
+                            required=False,
+                            use_markdown=True,
+                        )
+                    )
+                    if aspect in ["truthfulness", "helpfulness"]:
+                        questions.append(
+                            rg.RatingQuestion(
+                                name=f"type_{aspect}_{i}",
+                                values=list(range(1, 6)),
+                                title=f"The type of the response {i+1} for {aspect}",
+                                required=True,
+                            )
+                        )
+                        questions.append(
+                            rg.TextQuestion(
+                                name=f"rationale_for_type_{aspect}_{i}",
+                                title=f"Rationale for type of the response {i+1} for {aspect}",
+                                required=False,
+                                use_markdown=True,
+                            )
+                        )
+            metadata = [
+                rg.IntegerMetadataProperty(
+                    name="instruction_length", title="Instruction length"
+                ),
+            ]
+            for i in range(num_generations):
+                metadata.append(
+                    rg.IntegerMetadataProperty(
+                        name=f"response_{i}_length", title=f"Response {i+1} length"
+                    )
+                )
+            vectors = [
+                rg.VectorField(
+                    name="instruction_embeddings",
+                    dimensions=get_sentence_embedding_dimensions(),
+                )
+            ]
+            settings = rg.Settings(
+                fields=fields,
+                questions=questions,
+                metadata=metadata,
+                vectors=vectors,
+                guidelines="Please review the conversation and provide an evaluation.",
+            )
+            dataframe["instruction_length"] = dataframe["instruction"].apply(len)
+            for i in range(num_generations):
+                dataframe[f"response_{i}_length"] = dataframe["generations"].apply(
+                    lambda gens: len(gens[i]) if i < len(gens) else 0
+                )
+            dataframe["instruction_embeddings"] = get_embeddings(
+                dataframe["instruction"].to_list()
+            )
+            progress(0.5, desc="Creating dataset")
+            rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
+            if rg_dataset is None:
+                rg_dataset = rg.Dataset(
+                    name=repo_name,
+                    workspace=hf_user,
+                    settings=settings,
+                    client=client,
+                )
+                rg_dataset = rg_dataset.create()
+            progress(0.7, desc="Pushing dataset to Argilla")
+            hf_dataset = Dataset.from_pandas(dataframe)
+            records = []
+            for sample in hf_dataset:
+                fields = {}
+                metadata = {"instruction_length": sample.get("instruction_length", 0)}
+                vectors = {
+                    "instruction_embeddings": sample.get("instruction_embeddings", [])
+                }
+                suggestions = []
+                generations = sample.get("generations", [])
+                for i in range(num_generations):
+                    fields[f"chat_{i}"] = [
+                        {"role": "user", "content": sample.get("instruction", "")},
+                        {"role": "assistant", "content": generations[i]},
+                    ]
+                    metadata[f"response_{i}_length"] = sample.get(
+                        f"response_{i}_length", 0
+                    )
+                    for aspect in aspects_instruction_response:
+                        ratings = sample.get(f"ratings_{aspect}", [])
+                        rationales = sample.get(f"rationale_for_ratings__{aspect}", [])
+                        rating_value = (
+                            ratings[i]
+                            if ratings and isinstance(ratings[i], int)
+                            else None
+                        )
+                        rationale_value = (
+                            rationales[i]
+                            if rationales and isinstance(rationales[i], str)
+                            else None
+                        )
+                        if rating_value is not None:
+                            suggestions.append(
+                                rg.Suggestion(
+                                    question_name=f"ratings_{aspect}_{i}",
+                                    value=rating_value,
+                                )
+                            )
+                        if rationale_value is not None:
+                            suggestions.append(
+                                rg.Suggestion(
+                                    question_name=f"rationale_for_ratings_{aspect}_{i}",
+                                    value=rationale_value,
+                                )
+                            )
+                        if aspect in ["truthfulness", "helpfulness"]:
+                            types = sample.get(f"type_{aspect}", [])
+                            rationale_types = sample.get(
+                                f"rationale_for_type_{aspect}", []
+                            )
+                            type_value = (
+                                types[i]
+                                if types and isinstance(types[i], int)
+                                else None
+                            )
+                            rationale_type_value = (
+                                rationale_types[i]
+                                if rationale_types
+                                and isinstance(rationale_types[i], str)
+                                else None
+                            )
+                            if type_value is not None:
+                                suggestions.append(
+                                    rg.Suggestion(
+                                        question_name=f"type_{aspect}_{i}",
+                                        value=type_value,
+                                    )
+                                )
+                            if rationale_type_value is not None:
+                                suggestions.append(
+                                    rg.Suggestion(
+                                        question_name=f"rationale_for_type_{aspect}_{i}",
+                                        value=rationale_type_value,
+                                    )
+                                )
+                records.append(
+                    rg.Record(
+                        fields=fields,
+                        metadata=metadata,
+                        vectors=vectors,
+                        suggestions=suggestions,
+                    )
+                )
+            rg_dataset.records.log(records=records)
+            progress(1.0, desc="Dataset pushed to Argilla")
+        else:
+            columns = extract_column_names(prompt_template)
+            settings = rg.Settings(
+                fields=[
+                    rg.TextField(
+                        name=column,
+                        title=column.capitalize(),
+                        description="The column content",
+                    )
+                    for column in columns
+                ],
+                questions=[
+                    rg.TextQuestion(
+                        name="evaluation",
+                        title="Evaluation",
+                        description="The generated evaluation",
+                        use_markdown=True,
+                    ),
+                ],
+                metadata=[
+                    rg.IntegerMetadataProperty(
+                        name=f"{column}_length", title=f"{column.capitalize()} length"
+                    )
+                    for column in columns
+                ],
+                vectors=[
+                    rg.VectorField(
+                        name=f"{column}_embeddings",
+                        dimensions=get_sentence_embedding_dimensions(),
+                    )
+                    for column in columns
+                ],
+                guidelines="Please review, correct and provide an accurate evaluation.",
+            )
+            for column in columns:
+                dataframe[f"{column}_length"] = dataframe[column].apply(len)
+                dataframe[f"{column}_embeddings"] = get_embeddings(dataframe[column])
+            progress(0.5, desc="Creating dataset")
+            rg_dataset = client.datasets(name=repo_name, workspace=hf_user)
+            if rg_dataset is None:
+                rg_dataset = rg.Dataset(
+                    name=repo_name,
+                    workspace=hf_user,
+                    settings=settings,
+                    client=client,
+                )
+                rg_dataset = rg_dataset.create()
+            progress(0.7, desc="Pushing dataset to Argilla")
+            hf_dataset = Dataset.from_pandas(dataframe)
+            rg_dataset.records.log(
+                records=hf_dataset, mapping={"generation": "evaluation"}
+            )
+            progress(1.0, desc="Dataset pushed to Argilla")
+    except Exception as e:
+        raise gr.Error(f"Error pushing dataset to Argilla: {e}")
+    return ""
+def update_pipeline_code_visibility():
+    return {pipeline_code_ui: gr.Accordion(visible=True)}
 ######################
 with gr.Blocks() as app:
+    with gr.Column() as main_ui:
+        gr.Markdown("## 1. Select your input dataset")
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=1):
+                search_in = HuggingfaceHubSearch(
+                    label="Search",
+                    placeholder="Search for a dataset",
+                    search_type="dataset",
+                    sumbit_on_select=True,
                 )
+                load_btn = gr.Button("Load dataset", variant="primary")
+            with gr.Column(scale=3):
+                search_out = gr.HTML(label="Dataset preview")
+        gr.HTML(value="<hr>")
+        gr.Markdown(value="## 2. Configure your task")
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=1):
+                eval_type = gr.Dropdown(
+                    label="Evaluation type",
+                    choices=["ultrafeedback", "custom"],
+                    value="ultrafeedback",
+                    multiselect=False,
+                    visible=False,
                 )
+                with gr.Tab("ultrafeedback") as tab_instruction_response:
+                    aspects_instruction_response = define_evaluation_aspects(
+                        "ultrafeedback"
+                    )
+                    instruction_instruction_response = gr.Dropdown(
+                        label="Instruction Column",
+                        interactive=True,
+                        multiselect=False,
+                        allow_custom_value=False,
+                    )
+                    response_instruction_response = gr.Dropdown(
+                        label="Response Column",
+                        interactive=True,
+                        multiselect=True,
+                        allow_custom_value=False,
+                    )
+                    tab_instruction_response.select(
+                        fn=lambda: "ultrafeedback",
+                        inputs=[],
+                        outputs=[eval_type],
+                    )
+                with gr.Tab("custom") as tab_custom:
+                    aspects_custom = define_evaluation_aspects("custom")
+                    prompt_template = gr.Code(
+                        label="Prompt template",
+                        value="Evaluate {{column_1}} based on {{column_2}}.",
+                        language="markdown",
+                        interactive=True,
+                    )
+                    structured_output = gr.Code(
+                        label="Structured output",
+                        value=json.dumps(
+                            {
+                                "type": "object",
+                                "properties": {
+                                    "quality": {"type": "integer"},
+                                    "clarity": {"type": "integer"},
+                                    "relevance": {"type": "integer"},
+                                },
+                            },
+                            indent=4,
+                        ),
+                        language="json",
+                        interactive=True,
+                    )
+                    tab_custom.select(
+                        fn=lambda: "custom",
+                        inputs=[],
+                        outputs=[eval_type],
+                    )
+                btn_apply_to_sample_dataset = gr.Button(
+                    "Refresh dataset", variant="secondary", size="sm"
                 )
+            with gr.Column(scale=3):
+                dataframe = gr.Dataframe(
+                    headers=["prompt", "completion", "evaluation"],
+                    wrap=False,
+                    height=500,
+                    interactive=False,
                 )
+        gr.HTML(value="<hr>")
+        gr.Markdown(value="## 3. Evaluate your dataset")
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=2):
+                org_name = get_org_dropdown()
+                repo_name = gr.Textbox(
+                    label="Repo name",
+                    placeholder="dataset_name",
+                    value=f"my-distiset-{str(uuid.uuid4())[:8]}",
                     interactive=True,
                 )
+                num_rows = gr.Number(
+                    label="Number of rows",
+                    value=10,
                     interactive=True,
+                    scale=1,
                 )
+                private = gr.Checkbox(
+                    label="Private dataset",
+                    value=False,
+                    interactive=True,
+                    scale=1,
                 )
+                btn_push_to_hub = gr.Button("Push to Hub", variant="primary", scale=2)
+            with gr.Column(scale=3):
+                success_message = gr.Markdown(visible=True)
+                with gr.Accordion(
+                    "Do you want to go further? Customize and run with Distilabel",
+                    open=False,
+                    visible=False,
+                ) as pipeline_code_ui:
+                    code = generate_pipeline_code(
+                            repo_id=search_in.value,
+                            aspects=aspects_instruction_response.value,
+                            instruction_column=instruction_instruction_response,
+                            response_columns=response_instruction_response,
+                            prompt_template=prompt_template.value,
+                            structured_output=structured_output.value,
+                            num_rows=num_rows.value,
+                            eval_type=eval_type.value,
+                        )
+                    pipeline_code = gr.Code(
+                        value=code,
+                        language="python",
+                        label="Distilabel Pipeline Code",
+                    )
+    search_in.submit(fn=get_iframe, inputs=search_in, outputs=search_out)
     load_btn.click(
+        fn=load_dataset_from_hub,
         inputs=[search_in],
         outputs=[
             dataframe,
             instruction_instruction_response,
             response_instruction_response,
         ],
     )
     btn_apply_to_sample_dataset.click(
+        fn=evaluate_sample_dataset,
         inputs=[
             search_in,
             eval_type,
             aspects_instruction_response,
             instruction_instruction_response,
             response_instruction_response,
             prompt_template,
         ],
         outputs=dataframe,
     )
     btn_push_to_hub.click(
+        fn=validate_argilla_user_workspace_dataset,
+        inputs=[repo_name],
+        outputs=[success_message],
+        show_progress=True,
+    ).then(
+        fn=validate_push_to_hub,
+        inputs=[org_name, repo_name],
+        outputs=[success_message],
+        show_progress=True,
+    ).success(
+        fn=hide_success_message,
+        outputs=[success_message],
+        show_progress=True,
+    ).success(
+        fn=push_dataset,
         inputs=[
             org_name,
             repo_name,
             num_rows,
             search_in,
             eval_type,
             aspects_instruction_response,
             instruction_instruction_response,
             response_instruction_response,
             prompt_template,
             structured_output,
         ],
+        outputs=[success_message],
+        show_progress=True,
+    ).success(
+        fn=show_success_message,
+        inputs=[org_name, repo_name],
+        outputs=[success_message],
+    ).success(
+        fn=generate_pipeline_code,
+        inputs=[
+            search_in,
+            aspects_instruction_response,
+            instruction_instruction_response,
+            response_instruction_response,
+            prompt_template,
+            structured_output,
+            num_rows,
+            eval_type,
+        ],
+        outputs=[pipeline_code],
+    ).success(
+        fn=update_pipeline_code_visibility,
+        inputs=[],
+        outputs=[pipeline_code_ui],
     )
+    app.load(fn=swap_visibility, outputs=main_ui)
     app.load(fn=get_org_dropdown, outputs=[org_name])

src/distilabel_dataset_generator/apps/sft.py CHANGED Viewed

@@ -499,6 +499,10 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
         fn=show_success_message,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     ).success(
         fn=update_pipeline_code_visibility,
         inputs=[],

         fn=show_success_message,
         inputs=[org_name, repo_name],
         outputs=[success_message],
+    ).success(
+        fn=generate_pipeline_code,
+        inputs=[system_prompt, num_turns, num_rows],
+        outputs=[pipeline_code],
     ).success(
         fn=update_pipeline_code_visibility,
         inputs=[],

src/distilabel_dataset_generator/apps/textcat.py CHANGED Viewed

@@ -526,6 +526,17 @@ with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
         fn=show_success_message,
         inputs=[org_name, repo_name],
         outputs=[success_message],
     ).success(
         fn=update_pipeline_code_visibility,
         inputs=[],

         fn=show_success_message,
         inputs=[org_name, repo_name],
         outputs=[success_message],
+    ).success(
+        fn=generate_pipeline_code,
+        inputs=[
+            system_prompt,
+            difficulty,
+            clarity,
+            labels,
+            num_labels,
+            num_rows,
+        ],
+        outputs=[pipeline_code],
     ).success(
         fn=update_pipeline_code_visibility,
         inputs=[],

src/distilabel_dataset_generator/pipelines/eval.py ADDED Viewed

	@@ -0,0 +1,205 @@

+from typing import List
+from datasets import get_dataset_config_names, get_dataset_split_names
+from distilabel.llms import InferenceEndpointsLLM
+from distilabel.steps.tasks import (
+    UltraFeedback,
+    TextGeneration,
+)
+from src.distilabel_dataset_generator.pipelines.base import (
+    MODEL,
+    _get_next_api_key,
+)
+from src.distilabel_dataset_generator.utils import extract_column_names
+def get_ultrafeedback_evaluator(aspect, is_sample):
+    ultrafeedback_evaluator = UltraFeedback(
+        llm=InferenceEndpointsLLM(
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            api_key=_get_next_api_key(),
+            generation_kwargs={
+                "temperature": 0.7,
+                "max_new_tokens": 256 if is_sample else 2048,
+            },
+        ),
+        aspect=aspect,
+    )
+    ultrafeedback_evaluator.load()
+    return ultrafeedback_evaluator
+def get_custom_evaluator(prompt_template, structured_output, columns, is_sample):
+    custom_evaluator = TextGeneration(
+        llm=InferenceEndpointsLLM(
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            api_key=_get_next_api_key(),
+            structured_output={"format": "json", "schema": structured_output},
+            generation_kwargs={
+                "temperature": 0.7,
+                "max_new_tokens": 256 if is_sample else 2048,
+            },
+        ),
+        template=prompt_template,
+        columns=columns
+    )
+    custom_evaluator.load()
+    return custom_evaluator
+def generate_ultrafeedback_pipeline_code(
+    repo_id, subset, split, aspects, instruction_column, response_columns, num_rows
+):
+    if len(aspects) == 1:
+        code = f"""
+# Requirements: `pip install distilabel[hf-inference-endpoints]`
+import os
+from datasets import load_dataset
+from distilabel.pipeline import Pipeline
+from distilabel.steps import LoadDataFromDicts
+from distilabel.steps.tasks import UltraFeedback
+from distilabel.llms import InferenceEndpointsLLM
+MODEL = "{MODEL}"
+os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
+hf_ds = load_dataset("{repo_id}", "{subset}", split="{split}[:{num_rows}]")
+data = preprocess_data(hf_ds, "{instruction_column}", "{response_columns}") # to get a list of dictionaries
+with Pipeline(name="ultrafeedback") as pipeline:
+    load_the_dataset = LoadDataFromDicts(
+        data = data,
+    )
+    ultrafeedback_evaluator = UltraFeedback(
+        llm=InferenceEndpointsLLM(
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            api_key=os.environ["HF_TOKEN"],
+            generation_kwargs={{
+                "temperature": 0.7,
+                "max_new_tokens": 2048,
+            }},
+        ),
+        aspect=aspect,
+    )
+    load_the_dataset >> ultrafeedback_evaluator
+if __name__ == "__main__":
+    distiset = pipeline.run()
+"""
+    else:
+        code = f"""
+# Requirements: `pip install distilabel[hf-inference-endpoints]`
+import os
+from distilabel.pipeline import Pipeline
+from distilabel.steps import LoadDataFromDicts, CombineOutputs
+from distilabel.steps.tasks import UltraFeedback
+from distilabel.llms import InferenceEndpointsLLM
+MODEL = "{MODEL}"
+os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
+hf_ds = load_dataset("{repo_id}", "{subset}", split="{split}")
+data = preprocess_data(hf_ds, "{instruction_column}", "{response_columns}") # to get a list of dictionaries
+with Pipeline(name="ultrafeedback") as pipeline:
+    load_the_dataset = LoadDataFromDicts(
+        data = data,
+    )
+    tasks = []
+    for aspect in aspects:
+        evaluate_responses = UltraFeedback(
+            name=f"evaluate-responses-{{aspect}}",
+            aspect=aspect,
+            llm=InferenceEndpointsLLM(
+                model_id=MODEL,
+                tokenizer_id=MODEL,
+                api_key=os.environ["HF_TOKEN"],
+                generation_kwargs={{
+                    "temperature": 0.7,
+                    "max_new_tokens": 2048,
+                }},
+            output_mappings={{
+                "ratings": f"ratings_{{aspect}}",
+                "types": f"type_{{aspect}}",
+                "rationales": f"rationales_for_types_{{aspect}}",
+                "rationales-for-ratings": f"rationales_for_ratings_{{aspect}}",
+            }} if aspect in ["truthfulness", "helpfulness"] else {{"rationales": f"rationales_{{aspect}}", "ratings": f"ratings_{{aspect}}"}},
+        )
+        tasks.append(evaluate_responses)
+    combine_outputs = CombineOutputs()
+    load_the_dataset >> tasks >> combine_outputs
+if __name__ == "__main__":
+    distiset = pipeline.run()
+"""
+    return code
+def generate_custom_pipeline_code(
+    repo_id, subset, split, prompt_template, structured_output, num_rows
+):
+    columns = extract_column_names(structured_output)
+    code = f"""
+# Requirements: `pip install distilabel[hf-inference-endpoints, instructor]`
+import os
+from distilabel.pipeline import Pipeline
+from distilabel.steps import LoadDataFromHub
+from distilabel.steps.tasks import TextGeneration
+from distilabel.llms import InferenceEndpointsLLM
+MODEL = "{MODEL}"
+CUSTOM_TEMPLATE = "{prompt_template}"
+os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained
+with Pipeline(name="custom-evaluation") as pipeline:
+    load_the_dataset = LoadDataFromHub(
+        repo_id="{repo_id}",
+        config="{subset}",
+        split="{split}",
+        num_examples={num_rows},
+        batch_size=2
+    )
+    custom_evaluator = TextGeneration(
+        llm=InferenceEndpointsLLM(
+            model_id=MODEL,
+            tokenizer_id=MODEL,
+            api_key=os.environ["HF_TOKEN"],
+            structured_output={{"format": "json", "schema": {structured_output}}},
+            generation_kwargs={{
+                "temperature": 0.7,
+                "max_new_tokens": 2048,
+            }},
+        ),
+        template=CUSTOM_TEMPLATE,
+        columns={columns}
+    )
+    load_the_dataset >> custom_evaluator
+if __name__ == "__main__":
+    distiset = pipeline.run()
+"""
+    return code
+def generate_pipeline_code(repo_id, aspects, instruction_column, response_columns, prompt_template, structured_output, num_rows, eval_type):
+    if repo_id is None:
+        subset = "default"
+        split = "train"
+    else:
+        subset = get_dataset_config_names(repo_id)[0]
+        split = get_dataset_split_names(repo_id, subset)[0]
+    if eval_type == "ultrafeedback":
+        return generate_ultrafeedback_pipeline_code(repo_id, subset, split, aspects, instruction_column, response_columns, num_rows)
+    return generate_custom_pipeline_code(repo_id, subset, split, prompt_template, structured_output, num_rows)

src/distilabel_dataset_generator/utils.py CHANGED Viewed

@@ -1,8 +1,11 @@
 import os
 from typing import List, Optional, Union
 import argilla as rg
 import gradio as gr
 from gradio.oauth import (
     OAUTH_CLIENT_ID,
     OAUTH_CLIENT_SECRET,
@@ -11,6 +14,7 @@ from gradio.oauth import (
     get_space,
 )
 from huggingface_hub import whoami
 _LOGGED_OUT_CSS = ".main_ui_logged_out{opacity: 0.3; pointer-events: none}"
@@ -132,6 +136,91 @@ def get_argilla_client() -> Union[rg.Argilla, None]:
     except Exception:
         return None
 def get_preprocess_labels(labels: Optional[List[str]]) -> List[str]:
     """Return the distinct labels, lower-cased and stripped (order not fixed)."""
     if not labels:
         return []
     unique_labels = {label.lower().strip() for label in labels}
     return list(unique_labels)

+import json
 import os
 from typing import List, Optional, Union
 import argilla as rg
 import gradio as gr
+import numpy as np
+import pandas as pd
 from gradio.oauth import (
     OAUTH_CLIENT_ID,
     OAUTH_CLIENT_SECRET,
     get_space,
 )
 from huggingface_hub import whoami
+from jinja2 import Environment, meta
 _LOGGED_OUT_CSS = ".main_ui_logged_out{opacity: 0.3; pointer-events: none}"
     except Exception:
         return None
 def get_preprocess_labels(labels: Optional[List[str]]) -> List[str]:
     """Return the distinct labels, lower-cased and stripped (order not fixed)."""
     if not labels:
         return []
     unique_labels = {label.lower().strip() for label in labels}
     return list(unique_labels)
+def column_to_list(dataframe: pd.DataFrame, column_name: str) -> List[str]:
+    if column_name in dataframe.columns:
+        return dataframe[column_name].tolist()
+    else:
+        raise ValueError(f"Column '{column_name}' does not exist.")
+def process_columns(
+    dataframe,
+    instruction_column: str,
+    response_columns: Union[str, List[str]],
+) -> List[dict]:
+    instruction_column = [instruction_column]
+    if isinstance(response_columns, str):
+        response_columns = [response_columns]
+    data = []
+    for _, row in dataframe.iterrows():
+        instruction = ""
+        for col in instruction_column:
+            value = row[col]
+            if isinstance(value, (list, np.ndarray)):
+                user_contents = [d["content"] for d in value if d.get("role") == "user"]
+                if user_contents:
+                    instruction = user_contents[-1]
+            elif isinstance(value, str):
+                try:
+                    parsed_message = json.loads(value)
+                    user_contents = [
+                        d["content"] for d in parsed_message if d.get("role") == "user"
+                    ]
+                    if user_contents:
+                        instruction = user_contents[-1]
+                except json.JSONDecodeError:
+                    instruction = value
+            else:
+                instruction = ""
+        generations = []
+        for col in response_columns:
+            value = row[col]
+            if isinstance(value, (list, np.ndarray)):
+                if all(isinstance(item, dict) and "role" in item for item in value):
+                    assistant_contents = [
+                        d["content"] for d in value if d.get("role") == "assistant"
+                    ]
+                    if assistant_contents:
+                        generations.append(assistant_contents[-1])
+                else:
+                    generations.extend(value)
+            elif isinstance(value, str):
+                try:
+                    parsed_message = json.loads(value)
+                    assistant_contents = [
+                        d["content"]
+                        for d in parsed_message
+                        if d.get("role") == "assistant"
+                    ]
+                    if assistant_contents:
+                        generations.append(assistant_contents[-1])
+                except json.JSONDecodeError:
+                    generations.append(value)
+            else:
+                pass
+        data.append({"instruction": instruction, "generations": generations})
+    return data
+def extract_column_names(prompt_template: str) -> List[str]:
+    env = Environment()
+    parsed_content = env.parse(prompt_template)
+    variables = meta.find_undeclared_variables(parsed_content)
+    return list(variables)
+def pad_or_truncate_list(lst, target_length):
+    lst = lst or []
+    lst_length = len(lst)
+    if lst_length >= target_length:
+        return lst[-target_length:]
+    else:
+        return lst + [None] * (target_length - lst_length)