Spaces:

JeffYang52415
/

LLMEval-Dataset-Parser

Running

File size: 15,984 Bytes

import secrets
from functools import lru_cache
from typing import Any

import gradio as gr

from llmdataparser import ParserRegistry
from llmdataparser.base_parser import (
    VALID_CATEGORIES,
    DatasetDescription,
    DatasetParser,
    EvaluationMetric,
    ParseEntry,
)


@lru_cache(maxsize=32)
def get_parser_instance(parser_name: str) -> DatasetParser[Any]:
    """Return the parser registered under ``parser_name``, memoized by name.

    The lookup itself is delegated to ``ParserRegistry.get_parser``. Results
    are kept in an LRU cache of up to 32 names, so repeated calls with the
    same name return the same object without asking the registry again.
    """
    parser = ParserRegistry.get_parser(parser_name)
    return parser


def get_available_splits(parser: DatasetParser[Any]) -> list[str] | None:
    """Return the parser's split names as a list, or ``None`` if it has none.

    ``None`` is returned both when the parser lacks a ``split_names``
    attribute and when that attribute is empty or otherwise falsy.
    """
    split_names = getattr(parser, "split_names", None)
    return list(split_names) if split_names else None


def get_available_tasks(parser: DatasetParser[Any]) -> list[str]:
    """Return the parser's task names, or ``["default"]`` if it declares none.

    The fallback applies only when the parser has no ``task_names``
    attribute; an existing but empty attribute yields an empty list.
    """
    try:
        task_names = parser.task_names
    except AttributeError:
        return ["default"]
    return list(task_names)


def format_entry_attributes(entry: ParseEntry) -> str:
    """Render an entry's extra fields as newline-separated ``name: value`` lines.

    Every dataclass field of ``entry`` is included in declaration order,
    except ``question`` and ``answer``, which the UI shows separately.
    """
    from dataclasses import fields

    shown_elsewhere = ["question", "answer"]
    lines = []
    for spec in fields(entry):
        if spec.name in shown_elsewhere:
            continue
        lines.append(f"{spec.name}: {getattr(entry, spec.name)}")
    return "\n".join(lines)


def load_and_parse(
    parser_name: str, task_name: str | None, split_name: str | None
) -> tuple[int, str, str, str, gr.Dropdown, str]:
    """Load and parse a dataset, returning its first entry plus UI updates.

    Args:
        parser_name: Registry name of the parser to use.
        task_name: Task to load. The literal ``"default"`` is forwarded to
            the parser as ``None``.
        split_name: Split to load and parse, or ``None``.

    Returns:
        A 6-tuple in the order of the ``load_button`` outputs: entry index
        (always 0), question text, answer text, the remaining entry
        attributes, an update for the split dropdown, and the parser's
        ``repr`` as status text. If no entries were parsed, the three text
        fields are empty. If anything raises, the question slot carries the
        error message, the split dropdown's value is reset, and the status
        is empty.
    """
    try:
        parser = get_parser_instance(parser_name)

        # NOTE(review): trust_remote_code=True presumably allows code shipped
        # with the dataset to run during loading, and the parser/task/split
        # dropdowns accept custom values -- confirm this is acceptable for a
        # publicly hosted app.
        parser.load(
            task_name=task_name if task_name != "default" else None,
            split=split_name,
            trust_remote_code=True,
        )

        # Split names are read after loading and before parsing.
        available_splits = get_available_splits(parser)

        parser.parse(split_names=split_name, force=True)

        # Accessed without a call; presumably a property on the parser.
        parsed_data = parser.get_parsed_data

        split_dropdown = gr.Dropdown(
            choices=available_splits,
            label="Select Split",
            interactive=True,
            value=None,
            allow_custom_value=True,
        )

        info = repr(parser)
        if not parsed_data:
            return 0, "", "", "", split_dropdown, info

        first_entry = parsed_data[0]

        return (
            0,  # Index of the entry being displayed
            first_entry.question,
            first_entry.answer,
            format_entry_attributes(first_entry),
            split_dropdown,
            info,
        )
    except Exception as e:
        # Make the error message more user-friendly and detailed
        error_msg = f"Failed to load dataset: {str(e)}\nParser: {parser_name}\nTask: {task_name}\nSplit: {split_name}"
        # Return a real dropdown update rather than a bare list: a list would
        # become the dropdown's *value* and be passed back as ``split_name``
        # on the next click. Resetting the value keeps the existing choices.
        return 0, error_msg, "", "", gr.Dropdown(value=None), ""


def update_entry(
    parsed_data_index: int | None, parser_name: str
) -> tuple[str, str, str]:
    """Update the displayed entry based on the selected index."""
    try:
        if not parser_name:
            return "Please select a parser first", "", ""

        parser = get_parser_instance(parser_name)
        parsed_data = parser.get_parsed_data

        if not parsed_data:
            return "No data available", "", ""

        if parsed_data_index is None:
            # Random selection using secrets instead of random
            random_index = secrets.randbelow(len(parsed_data))
            entry = parsed_data[random_index]
        else:
            # Ensure index is within bounds
            index = max(0, min(parsed_data_index, len(parsed_data) - 1))
            entry = parsed_data[index]

        return (
            entry.question,
            entry.answer,
            format_entry_attributes(entry),
        )
    except Exception as e:
        return f"Error: {str(e)}", "", ""


def update_parser_options(parser_name: str) -> tuple[gr.Dropdown, gr.Dropdown, str]:
    """Build task and split dropdown updates plus status text for a parser.

    The task dropdown is preselected with the parser's ``_default_task``
    attribute (``"default"`` when absent); the split dropdown never has a
    preselected value. If anything raises, fallback dropdowns are returned
    and the status text carries the error.
    """
    try:
        parser = get_parser_instance(parser_name)

        task_update = gr.Dropdown(
            choices=get_available_tasks(parser),
            value=getattr(parser, "_default_task", "default"),
            label="Select Task",
            interactive=True,
            allow_custom_value=True,
        )

        # The split value is deliberately reset to None on every parser change.
        split_update = gr.Dropdown(
            choices=get_available_splits(parser),
            label="Select Split",
            interactive=True,
            value=None,
            allow_custom_value=True,
        )

        return task_update, split_update, repr(parser)
    except Exception as exc:
        fallback_tasks = gr.Dropdown(choices=["default"], value="default")
        fallback_splits = gr.Dropdown(choices=[])
        return fallback_tasks, fallback_splits, f"Error: {str(exc)}"


def clear_parser_cache() -> None:
    """Empty the ``get_parser_instance`` cache.

    After this call, the next lookup for any name goes back to
    ``ParserRegistry`` instead of reusing a memoized parser.
    """
    get_parser_instance.cache_clear()


def format_dataset_description(description: DatasetDescription) -> str:
    """Render a dataset description as Markdown text.

    The output starts with a level-1 heading holding the name, followed by
    bold-labelled purpose, language, format, source and characteristics.
    A fenced citation block and a bullet list of additional information are
    appended only when those fields are non-empty.
    """
    labelled_values = (
        ("Purpose", description.purpose),
        ("Language", description.language),
        ("Format", description.format),
        ("Source", description.source),
        ("Characteristics", description.characteristics),
    )

    parts = [f"# {description.name}"]
    # Each section starts with "\n" so the final join leaves a blank line
    # between sections.
    parts.extend(f"\n**{label}**: {text}" for label, text in labelled_values)

    citation = description.citation
    if citation:
        parts.append(f"\n**Citation**:\n```\n{citation}\n```")

    extra = description.additional_info
    if extra:
        parts.append("\n**Additional Information**:")
        parts.extend(f"- {key}: {value}" for key, value in extra.items())

    return "\n".join(parts)


def get_primary_metrics(metrics: list[EvaluationMetric]) -> list[str]:
    """Return the names of the metrics flagged ``primary``, in input order."""
    primary_names = []
    for candidate in metrics:
        if candidate.primary:
            primary_names.append(candidate.name)
    return primary_names


def format_metric_details(metric: EvaluationMetric) -> str:
    """Render one metric as three ``<br>``-separated Markdown lines.

    The first line is a level-1 heading with the metric name; the type and
    description follow on their own lines, each carrying sixteen spaces of
    leading indentation.
    """
    indent = " " * 16
    rendered_lines = (
        f"# {metric.name}<br>",
        f"{indent}**Type**: {metric.type}<br>",
        f"{indent}**Description**: {metric.description}",
    )
    return "\n".join(rendered_lines)


def update_dataset_info(
    parser_name: str,
) -> tuple[gr.Markdown, gr.Dropdown, gr.Markdown]:
    """Build the "Dataset Information" tab contents for a parser.

    Args:
        parser_name: Registry name of the parser whose description and
            evaluation metrics should be displayed.

    Returns:
        A 3-tuple of component updates: the formatted dataset description,
        the metric dropdown (choices are the primary metric names, with the
        first one preselected), and the details of the metric being shown.
        If anything raises, the description slot carries the error message,
        the dropdown has no choices and the details are empty.
    """
    try:
        parser = get_parser_instance(parser_name)
        description = parser.get_dataset_description()
        metrics = parser.get_evaluation_metrics()

        desc_text = format_dataset_description(description)

        # Only primary metrics are offered in the dropdown.
        primary_metrics = get_primary_metrics(metrics)

        # Show details for the metric the dropdown preselects (the first
        # primary one) so both widgets agree. When no metric is primary,
        # fall back to the first metric overall, or nothing if there are none.
        shown_metric = next(
            (metric for metric in metrics if metric.primary),
            metrics[0] if metrics else None,
        )
        metric_details = format_metric_details(shown_metric) if shown_metric else ""

        return (
            gr.Markdown(value=desc_text),
            gr.Dropdown(
                choices=primary_metrics,
                value=primary_metrics[0] if primary_metrics else None,
            ),
            gr.Markdown(value=metric_details),
        )
    except Exception as e:
        return (
            gr.Markdown(value=f"Error loading dataset description: {str(e)}"),
            gr.Dropdown(choices=[]),
            gr.Markdown(value=""),
        )


def update_metric_details(metric_name: str, parser_name: str) -> str:
    """Return the formatted details of the named metric for a parser.

    The first metric whose ``name`` equals ``metric_name`` is rendered; an
    empty string is returned when none matches. Any exception is turned
    into an error message instead of propagating.
    """
    try:
        candidates = get_parser_instance(parser_name).get_evaluation_metrics()
        match = next(filter(lambda m: m.name == metric_name, candidates), None)
        if not match:
            return ""
        return format_metric_details(match)
    except Exception as exc:
        return f"Error loading metric details: {str(exc)}"


def get_parser_categories(parser_name: str) -> list[str]:
    """Return the ``category`` of a parser's dataset description.

    Best effort: if the parser lookup or the description raises, an empty
    list is returned instead of propagating the error.
    """
    try:
        return get_parser_instance(parser_name).get_dataset_description().category
    except Exception:
        return []


def filter_parsers_by_category(category: str | None) -> list[str]:
    """Return the registered parser names that belong to ``category``.

    A falsy ``category`` (``None`` or an empty string) disables filtering
    and returns every registered parser name.
    """
    registered = ParserRegistry.list_parsers()
    if not category:
        return registered
    return [name for name in registered if category in get_parser_categories(name)]


def create_interface() -> gr.Blocks:
    """Create and return the Gradio interface.

    The layout has a header, a read-only status box and two tabs:
    "Dataset Explorer" (category/parser/task/split selectors, load and
    entry-selection controls, and question/answer/attribute outputs) and
    "Dataset Information" (dataset description plus metric details).

    Event wiring:
        * Changing the category re-filters the parser dropdown.
        * Changing the parser refreshes task/split options and status,
          mirrors the parser name into ``parser_state``, then refreshes the
          dataset information tab.
        * "Load and Parse Dataset" runs ``load_and_parse`` and then
          refreshes the dataset information tab.
        * "Update/Random Entry" runs ``update_entry`` against the parser
          name held in ``parser_state``.
        * Changing the metric dropdown refreshes the metric details.

    Returns:
        The assembled ``gr.Blocks`` app; it is not launched here.
    """
    with gr.Blocks(css="footer {display: none !important}") as demo:
        # Add header section with purpose and GitHub info
        gr.Markdown("""
            # LLM Evaluation Dataset Parser

            ### 🎯 Purpose
            A unified interface for parsing and exploring various LLM benchmark datasets (MMLU, MMLU-Pro, GSM8k, and more).
            This tool helps researchers and developers to:
            - Easily explore different benchmark datasets
            - Access standardized parsing for multiple dataset formats
            - View dataset descriptions and evaluation metrics

            ### 🔗 Links
            - [GitHub Repository](https://github.com/jeff52415/LLMDataParser)
            - [Documentation](https://github.com/jeff52415/LLMDataParser#readme)

            ---
        """)

        available_parsers = ParserRegistry.list_parsers()
        default_parser = available_parsers[0] if available_parsers else None

        # State management.
        # Seed the state with the same parser the dropdown preselects. The
        # state is otherwise only written by the dropdown's change event, so
        # without this a user who loads the default parser and then asks for
        # another entry is told to select a parser first.
        parser_state = gr.State(default_parser or "")
        dataset_status = gr.Textbox(label="Dataset Status", interactive=False)

        with gr.Tabs():
            with gr.Tab("Dataset Explorer"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # Add category dropdown before parser selection
                        category_dropdown = gr.Dropdown(
                            choices=["All"] + list(VALID_CATEGORIES),
                            label="Filter by Category",
                            value="All",
                            interactive=True,
                        )

                        # Parser selection and controls
                        parser_dropdown = gr.Dropdown(
                            choices=available_parsers,
                            label="Select Parser",
                            value=default_parser,
                            interactive=True,
                            allow_custom_value=True,
                        )
                        task_dropdown = gr.Dropdown(
                            choices=["default"],
                            label="Select Task",
                            value="default",
                            interactive=True,
                            allow_custom_value=True,
                        )
                        split_dropdown = gr.Dropdown(
                            choices=[],
                            label="Select Split",
                            interactive=True,
                            value=None,
                            allow_custom_value=True,
                        )
                        load_button = gr.Button(
                            "Load and Parse Dataset", variant="primary"
                        )

                        # Entry selection
                        entry_index = gr.Number(
                            label="Select Entry Index (empty for random)",
                            precision=0,
                            interactive=True,
                        )
                        update_button = gr.Button(
                            "Update/Random Entry", variant="secondary"
                        )

                    with gr.Column(scale=2):
                        # Output displays
                        question_output = gr.Textbox(
                            label="Question", lines=5, show_copy_button=True
                        )
                        answer_output = gr.Textbox(
                            label="Answer", lines=5, show_copy_button=True
                        )
                        attributes_output = gr.Textbox(
                            label="Other Attributes", lines=5, show_copy_button=True
                        )

            with gr.Tab("Dataset Information"):
                with gr.Row():
                    with gr.Column(scale=2):
                        # Dataset description
                        dataset_description = gr.Markdown()

                    with gr.Column(scale=1):
                        # Evaluation metrics
                        gr.Markdown("## Evaluation Metrics")
                        metric_dropdown = gr.Dropdown(
                            label="Select Primary Metric", interactive=True
                        )
                        metric_details = gr.Markdown()

        # Category filtering: "All" means no filter.
        def update_parser_list(category: str) -> gr.Dropdown:
            """Return a parser dropdown update restricted to ``category``."""
            filtered_parsers = filter_parsers_by_category(
                None if category == "All" else category
            )
            return gr.Dropdown(
                choices=filtered_parsers,
                value=filtered_parsers[0] if filtered_parsers else None,
            )

        category_dropdown.change(
            fn=update_parser_list, inputs=[category_dropdown], outputs=[parser_dropdown]
        )

        # Event handlers
        parser_dropdown.change(
            fn=update_parser_options,
            inputs=parser_dropdown,
            outputs=[
                task_dropdown,
                split_dropdown,
                dataset_status,
            ],
        ).then(lambda x: x, inputs=parser_dropdown, outputs=parser_state).then(
            fn=update_dataset_info,
            inputs=[parser_dropdown],
            outputs=[dataset_description, metric_dropdown, metric_details],
        )

        load_button.click(
            fn=load_and_parse,
            inputs=[parser_dropdown, task_dropdown, split_dropdown],
            outputs=[
                entry_index,
                question_output,
                answer_output,
                attributes_output,
                split_dropdown,
                dataset_status,
            ],
            api_name="load_and_parse",
            show_progress="full",
        ).then(
            fn=update_dataset_info,
            inputs=[parser_dropdown],
            outputs=[dataset_description, metric_dropdown, metric_details],
        )

        update_button.click(
            fn=update_entry,
            inputs=[entry_index, parser_state],
            outputs=[
                question_output,
                answer_output,
                attributes_output,
            ],
            api_name="update_entry",
        )

        metric_dropdown.change(
            fn=update_metric_details,
            inputs=[metric_dropdown, parser_dropdown],
            outputs=metric_details,
        )

    return demo


if __name__ == "__main__":
    # Announce start-up on stdout before the UI is built.
    print("Starting Gradio interface...")
    demo = create_interface()
    try:
        # show_error is enabled to make debugging easier.
        demo.launch(show_error=True)
    except Exception as exc:
        import traceback

        # Report the launch failure, then dump the full stack trace.
        print(f"Error launching Gradio: {exc}")
        traceback.print_exc()