import os
import torch
import datasets
import transformers
import json
from .schema import ActionInfo, EnvException, EnhancedJSONEncoder
from reactagent.prompt2model.prompt_parser import MockPromptSpec, TaskType
from reactagent.prompt2model.dataset_retriever import DescriptionDatasetRetriever
from reactagent.prompt2model.dataset_generator import PromptBasedDatasetGenerator, DatasetSplit
from reactagent.prompt2model.dataset_processor import TextualizeProcessor
from reactagent.prompt2model.model_retriever import DescriptionModelRetriever
from reactagent.prompt2model.model_trainer import GenerationModelTrainer
from reactagent.prompt2model.model_executor import GenerationModelExecutor, ModelOutput
from reactagent.prompt2model.model_evaluator import Seq2SeqEvaluator

def generate_dataset(instruction, examples, save_dir, num_train, num_valid, num_test, work_dir='.', **kwargs):
    try:
        num_train = int(num_train)
        num_valid = int(num_valid)
        num_test = int(num_test)
    except ValueError:
        raise EnvException("Number of examples should be an integer")
    prompt_spec = MockPromptSpec(TaskType.TEXT_GENERATION, instruction=instruction, examples=examples)
    generator = PromptBasedDatasetGenerator()
    dataset_dict = generator.generate_dataset_dict(prompt_spec, {
        DatasetSplit.TRAIN: num_train,
        DatasetSplit.VAL: num_valid,
        DatasetSplit.TEST: num_test,
    })
    save_path = os.path.join(work_dir, save_dir)
    dataset_dict.save_to_disk(save_path)
    return f"Dataset successfully generated and saved to {save_path}"

def retrieve_dataset(instruction, save_dir, work_dir='.', **kwargs):
    prompt_spec = MockPromptSpec(TaskType.TEXT_GENERATION, instruction=instruction, examples="")
    retriever = DescriptionDatasetRetriever()
    dataset_dict = retriever.retrieve_dataset_dict(prompt_spec)
    save_path = os.path.join(work_dir, save_dir)
    dataset_dict.save_to_disk(save_path)
    return f"Dataset successfully retrieved and saved to {save_path}"

def retrieve_model(instruction, work_dir='.', **kwargs):
    prompt_spec = MockPromptSpec(TaskType.TEXT_GENERATION, instruction=instruction, examples="")
    retriever = DescriptionModelRetriever(use_bm25=True, use_HyDE=True)
    top_models = retriever.retrieve(prompt_spec)
    return "Top Models:\n" + "".join(f"{i+1}. {model}\n" for i, model in enumerate(top_models))

def process_dataset(instruction, load_dirs, save_dirs, work_dir='.', **kwargs):
    prompt_spec = MockPromptSpec(TaskType.TEXT_GENERATION, instruction=instruction, examples="")
    load_dirs = load_dirs.split(':')
    save_dirs = save_dirs.split(':')
    if len(load_dirs) != len(save_dirs):
        raise EnvException("Number of load directories should match number of save directories")
    load_paths = [os.path.join(work_dir, load_dir) for load_dir in load_dirs]
    save_paths = [os.path.join(work_dir, save_dir) for save_dir in save_dirs]
    # load the datasets
    dataset_dicts = [datasets.load_from_disk(load_path) for load_path in load_paths]
    # process the datasets
    processor = TextualizeProcessor(has_encoder=True)
    modified_dataset_dicts = processor.process_dataset_dict(prompt_spec, dataset_dicts)
    # save the processed datasets
    for dataset_dict, save_path in zip(modified_dataset_dicts, save_paths):
        dataset_dict.save_to_disk(save_path)
    return f"Data successfully processed and saved to {save_paths}"

def train_model(model_name, load_dirs, result_dir, epochs, batch_size, warmup_steps, weight_decay, learning_rate, work_dir='.', **kwargs):
    try:
        epochs = int(epochs)
        batch_size = int(batch_size)
        warmup_steps = int(warmup_steps)
        weight_decay = float(weight_decay)
        learning_rate = float(learning_rate)
    except ValueError:
        raise EnvException("Numerical parameters should be integers or floats as appropriate")
    load_dirs = load_dirs.split(':')
    result_dir = os.path.join(work_dir, result_dir)
    # load the datasets
    load_paths = [os.path.join(work_dir, load_dir) for load_dir in load_dirs]
    dataset_dicts = [datasets.load_from_disk(load_path) for load_path in load_paths]
    training_datasets = [dataset_dict["train"] for dataset_dict in dataset_dicts]
    validation_datasets = [dataset_dict["val"] for dataset_dict in dataset_dicts]
    trainer = GenerationModelTrainer(
        model_name,
        has_encoder=True,
        executor_batch_size=batch_size,
        tokenizer_max_length=1024,
        sequence_max_length=1280,
    )
    hparams = {
        "output_dir": os.path.join(result_dir, "training_output"),
        "save_strategy": "epoch",
        "num_train_epochs": epochs,
        "per_device_train_batch_size": batch_size,
        "evaluation_strategy": "epoch",
        "warmup_steps": warmup_steps,
        "weight_decay": weight_decay,
        "learning_rate": learning_rate,
    }
    trained_model, trained_tokenizer = trainer.train_model(
        hyperparameter_choices=hparams,
        training_datasets=training_datasets,
        validation_datasets=validation_datasets,
    )
    trained_model.save_pretrained(os.path.join(result_dir, "trained_model"))
    trained_tokenizer.save_pretrained(os.path.join(result_dir, "trained_tokenizer"))
    return f"Model and Tokenizer successfully trained and saved respectively to {result_dir}/trained_model and {result_dir}/trained_tokenizer"

def execute_model(result_dir, load_dirs, save_path, batch_size, input_column, work_dir='.', **kwargs):
    load_dirs = load_dirs.split(':')
    result_dir = os.path.join(work_dir, result_dir)
    save_path = os.path.join(work_dir, save_path)
    try:
        batch_size = int(batch_size)
    except ValueError:
        raise EnvException("Batch size should be an integer")
    # load the datasets
    load_paths = [os.path.join(work_dir, load_dir) for load_dir in load_dirs]
    dataset_dicts = [datasets.load_from_disk(load_path) for load_path in load_paths]
    test_datasets = [dataset_dict["test"] for dataset_dict in dataset_dicts]
    test_dataset = datasets.concatenate_datasets(test_datasets)
    trained_model_path = os.path.join(result_dir, "trained_model")
    trained_tokenizer_path = os.path.join(result_dir, "trained_tokenizer")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trained_model = transformers.AutoModelForSeq2SeqLM.from_pretrained(trained_model_path).to(device)
    trained_tokenizer = transformers.AutoTokenizer.from_pretrained(trained_tokenizer_path)
    executor = GenerationModelExecutor(
        trained_model,
        trained_tokenizer,
        batch_size,
        tokenizer_max_length=1024,
        sequence_max_length=1280,
    )
    outputs = executor.make_prediction(
        test_set=test_dataset,
        input_column=input_column,
    )
    with open(save_path, 'w') as f:
        json.dump(outputs, f, cls=EnhancedJSONEncoder)
    return f"Model successfully executed on the test sets of the specified datasets and saved to {save_path}"

def evaluate_model(load_dirs, save_path, output_column, work_dir='.', **kwargs):
    load_dirs = load_dirs.split(':')
    # load the datasets
    load_paths = [os.path.join(work_dir, load_dir) for load_dir in load_dirs]
    dataset_dicts = [datasets.load_from_disk(load_path) for load_path in load_paths]
    test_datasets = [dataset_dict["test"] for dataset_dict in dataset_dicts]
    test_dataset = datasets.concatenate_datasets(test_datasets)
    save_path = os.path.join(work_dir, save_path)
    with open(save_path, 'r') as f:
        outputs = json.load(f)
    outputs = [ModelOutput(**output) for output in outputs]
    evaluator = Seq2SeqEvaluator()
    metric_values = evaluator.evaluate_model(
        test_dataset,
        gt_column=output_column,
        predictions=outputs,
        encoder_model_name="xlm-roberta-base",
    )
    return f"Evaluation metrics: {metric_values}"

P2M_ACTIONS = [
    ActionInfo(
        name="Retrieve Model",
        description="Retrieve a suitable model based on a detailed description of the requirements. You can obtain the model given the name using the transformers.AutoModelForSeq2SeqLM.from_pretrained function.",
        usage={
            "instruction": "an instruction on how to generate the output from the input",
        },
        return_value="The observation will be a list of suitable models. You can choose one of them based on the requirements.",
        is_primitive=False,
        function=retrieve_model,
    ),
]

# P2M_ACTIONS = [
#     ActionInfo(
#         name="Generate Dataset",
#         description="Generate a dataset based on an instruction and examples. You can load the dataset later from `save_dir` using the load_from_disk function of the HuggingFace datasets library.",
#         usage={
#             "instruction": "an instruction on how to generate the output from the input",
#             "examples": "examples of input-output pairs",
#             "save_dir": "directory to save the generated dataset dict to. We recommend saving to data/generated/",
#             "num_train": "number of examples to generate in the training set",
#             "num_valid": "number of examples to generate in the validation set",
#             "num_test": "number of examples to generate in the test set",
#         },
#         return_value="The observation will be a success message if the dataset was generated successfully. Otherwise, an error message will be returned.",
#         is_primitive=False,
#         function=generate_dataset
#     ),
#     ActionInfo(
#         name="Retrieve Dataset",
#         description="Retrieve a suitable dataset based on a detailed description of the requirements. You can load the dataset later from `save_dir` using the load_from_disk function of the HuggingFace datasets library.",
#         usage={
#             "instruction": "an instruction on how to generate the output from the input",
#             "save_dir": "directory to save the generated dataset dict to. We recommend saving to data/retrieved/",
#         },
#         return_value="The observation will be a success message if the dataset was retrieved successfully. Otherwise, an error message will be returned.",
#         is_primitive=False,
#         function=retrieve_dataset
#     ),
#     ActionInfo(
#         name="Retrieve Model",
#         description="Retrieve a suitable model based on a detailed description of the requirements. You can obtain the model given the name using the transformers.AutoModelForSeq2SeqLM.from_pretrained function.",
#         usage={
#             "instruction": "an instruction on how to generate the output from the input",
#         },
#         return_value="The observation will be a list of suitable models. You can choose one of them based on the requirements.",
#         is_primitive=False,
#         function=retrieve_model
#     ),
#     ActionInfo(
#         name="Process Dataset",
#         description="Process dataset based on a detailed description of the requirements. You can load the processed data later from `save_dirs` using the load_from_disk function of the HuggingFace datasets library. The input text will be in the `model_input` column and the output text will be in the `model_output` column.",
#         usage={
#             "instruction": "an instruction on how to generate the output from the input",
#             "load_dirs": "directories to load the dataset dicts from, separated by colons",
#             "save_dirs": "directories to save the processed dataset dicts to, separated by colons. The order should match the order of the loaded datasets. We recommend saving to data/processed/",
#         },
#         return_value="The observation will be a success message if the data was processed successfully. Otherwise, an error message will be returned.",
#         is_primitive=False,
#         function=process_dataset
#     ),
#     ActionInfo(
#         name="Train Model",
#         description="Train a Seq2Seq model from HuggingFace transformers library using the processed datasets and given hyperparameters.",
#         usage={
#             "model_name": "name of the model to train",
#             "load_dirs": "directories to load the dataset dicts from, separated by colons",
#             "result_dir": "directory to save the trained model and tokenizer to. We recommend using results/{trial_id}/. The trained model will be available as `{result_dir}/trained_model/` and the tokenizer will be available as `{result_dir}/trained_tokenizer/`.",
#             "epochs": "number of epochs to train the model for",
#             "batch_size": "batch size for training the model",
#             "warmup_steps": "number of warmup steps for the optimizer",
#             "weight_decay": "weight decay for the optimizer",
#             "learning_rate": "learning rate for the optimizer",
#         },
#         return_value="The observation will be a success message if the model was trained successfully. Otherwise, an error message will be returned.",
#         is_primitive=False,
#         function=train_model
#     ),
#     ActionInfo(
#         name="Execute Model on Test Set",
#         description="Execute a trained model on the test sets of specified dataset dicts.",
#         usage={
#             "result_dir": "directory where the trained model and tokenizer are saved",
#             "load_dirs": "directories to load the dataset dicts from, separated by colons",
#             "save_path": "file to save the results of the model execution in json format",
#             "batch_size": "batch size for executing the model",
#             "input_column": "column name of the input text",
#         },
#         return_value="The observation will be a success message if the model was executed successfully. Otherwise, an error message will be returned.",
#         is_primitive=False,
#         function=execute_model,
#     ),
#     ActionInfo(
#         name="Evaluate Model",
#         description="Evaluate a trained model on the test sets of specified dataset dicts.",
#         usage={
#             "load_dirs": "directories to load the dataset dicts from, separated by colons",
#             "save_path": "file to load the results of the model execution in json format",
#             "output_column": "column name of the output text",
#         },
#         return_value="The values for various evaluation metrics will be returned.",
#         is_primitive=False,
#         function=evaluate_model,
#     )
# ]