machine-translation

Build error

File size: 18,584 Bytes

import glob
import math
import os
import re

import evaluate
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from tqdm import tqdm

from eval_modules.calc_repetitions import *
from llm_toolkit.llm_utils import load_tokenizer, print_row_details

# Trace that this module is being imported/executed.
print(f"loading {__file__}")

# Metric objects are loaded once at import time through the `evaluate`
# library and then used as module-level globals by calc_metrics().
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
# NOTE(review): calc_metrics() computes exact-match accuracy by hand and binds
# its own local named `accuracy`; no function in this file calls this object.
accuracy = evaluate.load("accuracy")


def extract_answer(text, debug=False):
    """Strip chat-template scaffolding from a raw model generation.

    Three regex substitutions are applied in a fixed order:

    1. drop everything up to and including an ``assistant`` / ``[/INST]``
       marker, plus the characters that follow it up to the next word
       boundary;
    2. drop everything from the first ``<...>`` tag to the end of the text;
    3. drop everything up to and including ``end_header_id|>`` followed by a
       blank line.

    Args:
        text: Raw generation. Falsy values (``None``, ``""``) are returned
            unchanged.
        debug: When true, print the intermediate text after every step.

    Returns:
        The cleaned text, or the original falsy value.
    """
    if not text:
        return text

    regex_flags = re.DOTALL | re.MULTILINE
    cleanup_patterns = (
        r".*?(assistant|\[/INST\]).+?\b",
        r"<.+?>.*",
        r".*?end_header_id\|>\n\n",
    )

    for step_no, pattern in enumerate(cleanup_patterns, start=1):
        text = re.sub(pattern, "", text, flags=regex_flags)
        if debug:
            print(f"--------\nstep {step_no}:", text)

    return text


def calc_metrics(references, predictions, debug=False):
    """Score predictions against references with METEOR, BLEU, ROUGE and exact match.

    Every prediction is first cleaned with ``extract_answer``; the cleaned
    texts are then passed to the module-level ``meteor``, ``bleu`` (with
    ``max_order=4``) and ``rouge`` metric objects.

    Args:
        references: Sequence of reference translations.
        predictions: Sequence of raw model outputs, same length as
            ``references``.
        debug: When true, also report the indices of exact matches under
            ``"correct_ids"``.

    Returns:
        dict with keys ``"meteor"`` (the METEOR value), ``"bleu_scores"`` and
        ``"rouge_scores"`` (the full results returned by the respective
        ``compute`` calls), ``"accuracy"`` (share of exact matches) and,
        when ``debug`` is set, ``"correct_ids"``.

    Raises:
        AssertionError: If the two sequences differ in length.
    """
    assert len(references) == len(
        predictions
    ), f"lengths are difference: {len(references)} != {len(predictions)}"

    cleaned = [extract_answer(raw) for raw in predictions]

    meteor_result = meteor.compute(predictions=cleaned, references=references)
    bleu_result = bleu.compute(
        predictions=cleaned, references=references, max_order=4
    )
    rouge_result = rouge.compute(predictions=cleaned, references=references)

    # Exact-match bookkeeping: one flag per (reference, cleaned prediction) pair.
    exact_flags = [1 if ref == pred else 0 for ref, pred in zip(references, cleaned)]

    results = {
        "meteor": meteor_result["meteor"],
        "bleu_scores": bleu_result,
        "rouge_scores": rouge_result,
        "accuracy": sum(exact_flags) / len(references),
    }

    if debug:
        results["correct_ids"] = [
            idx for idx, flag in enumerate(exact_flags) if flag == 1
        ]

    return results


def save_results(model_name, results_path, dataset, predictions, debug=False):
    """Store one model's predictions as a column of the results CSV.

    If ``results_path`` already exists it is read back and extended;
    otherwise a fresh table is created from ``dataset.to_pandas()`` with the
    ``"text"`` and ``"prompt"`` columns removed (when present).

    Args:
        model_name: Name of the column that receives ``predictions``.
        results_path: Path of the CSV file to create or update.
        dataset: Object exposing ``to_pandas()``; only used when the CSV
            does not exist yet.
        predictions: One prediction per row of the table.
        debug: When true, print the first row of the updated table.
    """
    if os.path.exists(results_path):
        df = pd.read_csv(results_path, on_bad_lines="warn")
    else:
        dir_path = os.path.dirname(results_path)
        # A bare filename has an empty directory part, and os.makedirs("")
        # raises FileNotFoundError, so only create directories when there
        # actually is one to create.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)
        df = dataset.to_pandas()
        df.drop(columns=["text", "prompt"], inplace=True, errors="ignore")

    df[model_name] = predictions

    if debug:
        print(df.head(1))

    df.to_csv(results_path, index=False)


# System message used for the chat-template prompts built in
# load_translation_dataset() and for the "system" field written by
# load_alpaca_data().
system_prompt = "You are a helpful assistant that translates Chinese to English."


def get_few_shot_prompt(dataset, num_shots=5):
    """Build the translation prompt template, optionally with few-shot examples.

    The returned string is a *format template*: it ends with a ``{input}``
    placeholder that callers fill in via ``str.format(input=...)`` or a
    prompt-template class. Because of that, literal braces occurring in the
    example sentences are escaped (``{`` -> ``{{``, ``}`` -> ``}}``) so they
    survive formatting as plain text instead of being parsed as placeholders.

    Args:
        dataset: Indexable collection whose items expose ``"chinese"`` and
            ``"english"`` entries; items ``0 .. num_shots - 1`` are used as
            examples.
        num_shots: Number of examples to embed; ``0`` (or less) embeds none.

    Returns:
        The prompt template string.

    Raises:
        IndexError: If ``num_shots`` exceeds the number of available items
            (propagated from indexing ``dataset``).
    """

    def _escape_braces(value):
        # Double the braces so a later .format() renders them literally.
        return str(value).replace("{", "{{").replace("}", "}}")

    translation_prompt = "You will be given a Chinese sentence to translate. If it is an incomplete sentence, or if you are unsure about the meaning, simply copy the input text as your output. Do not output any additional sentence such as explanation or reasoning.\n\n"

    if num_shots > 0:
        example_lines = ["Example Translations:\n"]
        for i in range(num_shots):
            example = dataset[i]
            example_lines.append(f"Chinese: {_escape_braces(example['chinese'])}\n")
            example_lines.append(f"English: {_escape_braces(example['english'])}\n")
        translation_prompt = translation_prompt + "".join(example_lines) + "\n"

    return translation_prompt + "Chinese: {input}\nEnglish:"


def load_translation_dataset(
    data_path, tokenizer=None, num_shots=5, test_size=0.2, seed=None
):
    """Load (and, on first use, create) the train/test split of a TSV corpus.

    The split files are named after ``data_path`` with ``.tsv`` replaced by
    ``-train.tsv`` / ``-test.tsv``. When the train file does not exist, the
    source TSV is loaded, rows with an empty ``chinese`` or ``english`` field
    are filtered out, the data is split, and both halves are written to disk.
    The split files are then loaded as a dataset dict with ``"train"`` and
    ``"test"`` entries.

    When a ``tokenizer`` is given, two columns are added to every example:
    ``"prompt"`` (system + user message rendered through the tokenizer's chat
    template with the generation prompt appended) and ``"text"`` (that prompt
    followed by the reference translation and the tokenizer's EOS token).

    Args:
        data_path: Path to the source ``.tsv`` file.
        tokenizer: Optional tokenizer exposing ``apply_chat_template`` and
            ``eos_token``.
        num_shots: Number of few-shot examples (taken from the train split)
            to embed in each prompt.
        test_size: Fraction of rows assigned to the test split when the split
            files have to be generated.
        seed: Optional seed forwarded to ``train_test_split`` so a newly
            generated split is reproducible; ``None`` keeps it unseeded.

    Returns:
        The loaded dataset dict, with prompt columns if ``tokenizer`` was set.
    """
    train_data_file = data_path.replace(".tsv", "-train.tsv")
    test_data_file = data_path.replace(".tsv", "-test.tsv")

    if not os.path.exists(train_data_file):
        print("generating train/test data files")
        dataset = load_dataset(
            "csv", data_files=data_path, delimiter="\t", split="train"
        )
        # Row count before and after dropping incomplete pairs.
        print(len(dataset))
        dataset = dataset.filter(lambda x: x["chinese"] and x["english"])

        datasets = dataset.train_test_split(test_size=test_size, seed=seed)
        print(len(dataset))

        # Persist both halves so later runs reuse exactly the same split.
        train_df = pd.DataFrame(datasets["train"])
        test_df = pd.DataFrame(datasets["test"])
        train_df.to_csv(train_data_file, sep="\t", index=False)
        test_df.to_csv(test_data_file, sep="\t", index=False)

    print("loading train/test data files")
    datasets = load_dataset(
        "csv",
        data_files={"train": train_data_file, "test": test_data_file},
        delimiter="\t",
    )

    if tokenizer:
        translation_prompt = get_few_shot_prompt(datasets["train"], num_shots)

        def formatting_prompts_func(examples):
            """Render the chat prompt and the full training text for a batch."""
            system_message = {"role": "system", "content": system_prompt}

            texts = []
            prompts = []
            for source_text, target_text in zip(
                examples["chinese"], examples["english"]
            ):
                user_message = {
                    "role": "user",
                    "content": translation_prompt.format(input=source_text),
                }
                prompt = tokenizer.apply_chat_template(
                    [system_message, user_message],
                    tokenize=False,
                    add_generation_prompt=True,
                )
                prompts.append(prompt)
                texts.append(prompt + target_text + tokenizer.eos_token)
            return {"text": texts, "prompt": prompts}

        datasets = datasets.map(
            formatting_prompts_func,
            batched=True,
        )

    print(datasets)
    return datasets


def count_entries_with_max_tokens(entries, max_tokens):
    """Return how many entries reached the output-token ceiling.

    Args:
        entries: Iterable of per-entry token counts.
        max_tokens: Threshold; an entry is counted when its token count is
            greater than or equal to this value.

    Returns:
        int: Number of entries with ``tokens >= max_tokens``.
    """
    return sum(1 for tokens in entries if tokens >= max_tokens)


def detect_repetition_scores(row, col, debug=False):
    """Compute repetition scores of ``row[col]`` in excess of the ground truth.

    ``detect_repetitions`` is run on the text in ``row[col]``; the row's
    ``ground_truth_ews_score``, ``ground_truth_repetition_score`` and
    ``ground_truth_total_repetitions`` values are subtracted from the three
    results, and anything that is not strictly positive is reported as 0.

    Args:
        row: Mapping/row providing ``col`` and the three ground-truth fields.
        col: Name of the field holding the text to analyse.
        debug: Forwarded to ``detect_repetitions``.

    Returns:
        pandas.Series with three values, in the order: newline score,
        repetition score, total repetitions.
    """
    newline_raw, repetition_raw, total_raw = detect_repetitions(
        row[col], debug=debug
    )

    baselines = (
        row["ground_truth_ews_score"],
        row["ground_truth_repetition_score"],
        row["ground_truth_total_repetitions"],
    )

    excess_scores = []
    for measured, baseline in zip((newline_raw, repetition_raw, total_raw), baselines):
        surplus = measured - baseline
        # Only penalise repetition beyond what the reference itself contains.
        excess_scores.append(surplus if surplus > 0 else 0)

    return pd.Series(excess_scores)


def get_metrics(df, max_output_tokens=2048, variant="rpp"):
    """Build a per-column metrics table for a results DataFrame.

    Every column from the third one onwards is treated as one model output
    column named ``<model>/<variant>-<value>``. For each such column the
    function computes METEOR, BLEU and ROUGE-L against ``df["english"]``
    (via ``calc_metrics``), the mean repetition scores in excess of the
    ground truth, the number of outputs whose token count reaches
    ``max_output_tokens``, and ``rap`` = ``meteor / log10(10 +
    total_repetitions)``.

    Side effects: ``df`` is modified in place. It gains the
    ``ground_truth_*`` score columns, the ``ews_score`` / ``repetition_score``
    / ``total_repetitions`` columns (holding the values of the last processed
    model column), and per-column ``ground_truth_tokens-<model>`` and
    ``output_tokens-<col>`` columns.

    NOTE(review): because model columns are taken as ``df.columns[2:]``,
    calling this twice on the same ``df`` would also pick up the helper
    columns added by the first call.

    Args:
        df: Results table; must contain an ``"english"`` column and have its
            model output columns starting at position 2.
        max_output_tokens: Token count at or above which an output is counted
            in ``num_max_output_tokens``.
        variant: Name of the variant encoded in the column names; also used
            as the name of the variant column in the returned table.

    Returns:
        pandas.DataFrame with one row per model output column and the columns
        ``model``, ``<variant>``, ``meteor``, ``bleu_1`` (the ``"bleu"``
        value reported by ``calc_metrics``), ``rouge_l``, ``ews_score``,
        ``repetition_score``, ``total_repetitions``, ``rap`` and
        ``num_max_output_tokens``.
    """
    # One separator for both deriving the tokenizer keys and looking them up
    # per column; using two different split strings could produce a key that
    # is missing from `tokenizers`.
    model_separator = f"/{variant}-"

    metrics_df = pd.DataFrame(df.columns.T)[2:]
    metrics_df.rename(columns={0: "model"}, inplace=True)
    metrics_df[variant] = metrics_df["model"].apply(
        lambda x: x.split(f"{variant}-")[-1]
    )
    metrics_df["model"] = metrics_df["model"].apply(
        lambda x: x.split(model_separator)[0]
    )
    metrics_df.reset_index(inplace=True)
    metrics_df = metrics_df.drop(columns=["index"])

    # Load each distinct tokenizer once, not once per column.
    tokenizers = {
        model: load_tokenizer(model) for model in metrics_df["model"].unique()
    }

    meteor_scores = []
    bleu_scores = []
    rouge_l_scores = []
    ews_score = []
    repetition_score = []
    total_repetitions = []
    num_max_output_tokens = []

    # Snapshot the model columns before helper columns are appended to df.
    columns = df.columns[2:]

    df[
        [
            "ground_truth_ews_score",
            "ground_truth_repetition_score",
            "ground_truth_total_repetitions",
        ]
    ] = df["english"].apply(detect_scores)

    for col in columns:
        metrics = calc_metrics(df["english"], df[col], debug=True)
        print(f"{col}: {metrics}")

        meteor_scores.append(metrics["meteor"])
        bleu_scores.append(metrics["bleu_scores"]["bleu"])
        rouge_l_scores.append(metrics["rouge_scores"]["rougeL"])

        df[["ews_score", "repetition_score", "total_repetitions"]] = df.apply(
            lambda x: detect_repetition_scores(x, col), axis=1
        )
        ews_score.append(df["ews_score"].mean())
        repetition_score.append(df["repetition_score"].mean())
        total_repetitions.append(df["total_repetitions"].mean())

        model = col.split(model_separator)[0]
        tokenizer = tokenizers[model]

        new_col = f"ground_truth_tokens-{model}"
        df[new_col] = df["english"].apply(lambda x: len(tokenizer(x)["input_ids"]))

        new_col = f"output_tokens-{col}"
        df[new_col] = df[col].apply(lambda x: len(tokenizer(x)["input_ids"]))

        num_max_output_tokens.append(
            count_entries_with_max_tokens(df[new_col], max_output_tokens)
        )

    metrics_df["meteor"] = meteor_scores
    metrics_df["bleu_1"] = bleu_scores
    metrics_df["rouge_l"] = rouge_l_scores
    metrics_df["ews_score"] = ews_score
    metrics_df["repetition_score"] = repetition_score
    metrics_df["total_repetitions"] = total_repetitions
    # METEOR discounted by the (log-scaled) amount of excess repetition.
    metrics_df["rap"] = metrics_df.apply(
        lambda x: x["meteor"] / math.log10(10 + x["total_repetitions"]), axis=1
    )

    metrics_df["num_max_output_tokens"] = num_max_output_tokens

    return metrics_df


def analyze_translation_results(df, col, max_new_tokens=300, repetition_threshold=100):
    """Print the problematic outputs of one model column.

    Two reports are printed: rows whose excess ``total_repetitions`` exceeds
    ``repetition_threshold`` (with source, reference, output and a debug run
    of ``detect_repetitions``), and rows whose ``output_tokens-<col>`` value
    is at least ``max_new_tokens``.

    ``df`` is modified in place: its ``ews_score``, ``repetition_score`` and
    ``total_repetitions`` columns are recomputed for ``col``. The frame must
    already contain the ``ground_truth_*`` score columns and an
    ``output_tokens-<col>`` column (``get_metrics`` adds both).

    Args:
        df: Results table with ``chinese``, ``english`` and ``col`` columns.
        col: Name of the model output column to analyse.
        max_new_tokens: Token count at or above which an output is listed.
        repetition_threshold: Rows with more total repetitions than this are
            listed.
    """
    score_columns = ["ews_score", "repetition_score", "total_repetitions"]
    df[score_columns] = df.apply(
        lambda record: detect_repetition_scores(record, col), axis=1
    )

    repetitive = df.query(f"total_repetitions > {repetition_threshold}")
    print(
        f"*** Found {len(repetitive)} rows with total_repetitions > {repetition_threshold} for {col}"
    )

    divider = "=" * 80
    for _, record in repetitive.iterrows():
        print(record["chinese"])
        print(divider)
        print(record["english"])
        print(divider)
        generated = record[col]
        print(generated)
        print(divider)
        detect_repetitions(generated, debug=True)

    token_column = f"output_tokens-{col}"
    at_limit = df[token_column] >= max_new_tokens
    truncated = df[at_limit][["chinese", "english", col, token_column]]

    print(
        f"\n*** Found {len(truncated)} rows with output_tokens >= {max_new_tokens} for {col}"
    )
    print_row_details(truncated, range(len(truncated)))


def plot_metrics(metrics_df, figsize=(14, 5), ylim=(0, 0.44)):
    """Draw a grouped bar chart of METEOR / BLEU-1 / ROUGE-L per model.

    The ``meteor``, ``bleu_1`` and ``rouge_l`` columns of ``metrics_df`` are
    melted to long form and plotted with one hue per ``model``. Each model
    gets its own hatch pattern (bars and legend handles), and every bar with
    a non-zero height is annotated with its value to two decimals.

    NOTE(review): the bar-to-model mapping assumes the patches are ordered
    model by model (one run of bars per model) and that each row of
    ``metrics_df`` holds a distinct model -- confirm against the seaborn
    version in use.

    Args:
        metrics_df: Table with a ``model`` column and the three metric columns.
        figsize: Figure size passed to ``plt.figure``.
        ylim: y-axis limits of the chart.
    """
    plt.figure(figsize=figsize)
    long_df = pd.melt(
        metrics_df, id_vars="model", value_vars=["meteor", "bleu_1", "rouge_l"]
    )

    ax = sns.barplot(x="variable", y="value", hue="model", data=long_df)

    # Available hatch patterns; reused cyclically when there are more models.
    hatch_cycle = ["/", "\\", "|", "-", "+", "x", "o", "O", ".", "*", "//", "\\\\"]

    hatch_by_model = {}
    for position, model_name in enumerate(metrics_df["model"].unique()):
        hatch_by_model[model_name] = hatch_cycle[position % len(hatch_cycle)]

    # Hatch every bar according to the model it is attributed to.
    bars_per_model = len(long_df["variable"].unique())
    for bar_index, bar in enumerate(ax.patches):
        owner = long_df["model"].iloc[bar_index // bars_per_model]
        bar.set_hatch(hatch_by_model[owner])

    # Keep the legend swatches in sync with the bar hatches.
    legend_handles, _legend_labels = ax.get_legend_handles_labels()
    for swatch, model_name in zip(legend_handles, metrics_df["model"].unique()):
        swatch.set_hatch(hatch_by_model[model_name])

    ax.set_xticklabels(["METEOR", "BLEU-1", "ROUGE-L"])

    # Value labels above the bars; zero-height patches are left unlabeled.
    for bar in ax.patches:
        bar_height = bar.get_height()
        if bar_height == 0:
            continue
        label_anchor = (bar.get_x() + bar.get_width() / 2.0, bar_height)
        ax.annotate(
            f"{bar_height:.2f}",
            label_anchor,
            ha="center",
            va="center",
            xytext=(0, 10),
            textcoords="offset points",
        )

    ax.set(ylim=ylim, ylabel="Scores", xlabel="Metrics")
    plt.legend(bbox_to_anchor=(0.5, -0.1), loc="upper center")
    plt.show()


def plot_times(perf_df, ylim=0.421):
    """Plot stacked train/eval time bars per model, with an optional METEOR line.

    ``train-time(mins)`` is drawn as the bottom (red) segment and
    ``eval-time(mins)`` is stacked on top of it (orange). When ``perf_df``
    has a ``meteor`` column, it is drawn as a line on a secondary y-axis whose
    upper limit is ``ylim``. Every bar segment with a non-zero height is
    annotated with its value to two decimals.

    Args:
        perf_df: Table with ``model``, ``train-time(mins)`` and
            ``eval-time(mins)`` columns, and optionally ``meteor``.
        ylim: Upper limit of the METEOR axis.
    """
    fig, time_ax = plt.subplots(figsize=(12, 10))

    train_color = "tab:red"
    eval_color = "orange"

    time_ax.set_xlabel("Models")
    time_ax.set_ylabel("Time (mins)")

    model_names = perf_df["model"]
    time_ax.set_xticks(range(len(model_names)))  # one tick per model
    time_ax.set_xticklabels(model_names, rotation=90)

    # Bottom segment: training time.
    train_minutes = perf_df["train-time(mins)"]
    time_ax.bar(
        model_names,
        train_minutes,
        color=train_color,
        label="train-time",
    )

    # Top segment: evaluation time stacked on the training time.
    eval_minutes = perf_df["eval-time(mins)"]
    time_ax.bar(
        model_names,
        eval_minutes,
        bottom=train_minutes,
        color=eval_color,
        label="eval-time",
    )

    time_ax.tick_params(axis="y")
    time_ax.legend(loc="upper left")

    has_meteor = "meteor" in perf_df.columns
    if has_meteor:
        meteor_ax = time_ax.twinx()
        meteor_color = "tab:blue"
        meteor_ax.set_ylabel("METEOR", color=meteor_color)
        meteor_ax.plot(
            perf_df["model"],
            perf_df["meteor"],
            color=meteor_color,
            marker="o",
            label="meteor",
        )
        meteor_ax.tick_params(axis="y", labelcolor=meteor_color)
        meteor_ax.legend(loc="upper right")
        # Keep the automatic lower bound, pin only the upper bound.
        lower_bound = meteor_ax.get_ylim()[0]
        meteor_ax.set_ylim(lower_bound, ylim)

    # Write each segment's value just below its top edge.
    for segment in time_ax.patches:
        segment_height = segment.get_height()
        if segment_height == 0:
            continue
        top_center = (
            segment.get_x() + segment.get_width() / 2.0,
            segment.get_y() + segment_height,
        )
        time_ax.annotate(
            f"{segment_height:.2f}",
            top_center,
            ha="center",
            va="center",
            xytext=(0, -10),
            textcoords="offset points",
        )

    fig.tight_layout()
    plt.show()


def translate_via_openai(
    text, translation_prompt, max_tokens=None, model="gpt-4o-mini", base_url=None
):
    """Translate one text through a ``ChatOpenAI`` chat model.

    A two-message chat prompt is built from the module-level ``system_prompt``
    (the same system message used for the locally evaluated models, so both
    evaluation paths stay in sync) and ``translation_prompt`` as the human
    message, then invoked with ``text`` bound to the ``input`` variable.

    Args:
        text: Source text substituted for ``{input}`` in the prompt.
        translation_prompt: Human-message template containing ``{input}``.
        max_tokens: Generation limit passed to ``ChatOpenAI``.
        model: Model name passed to ``ChatOpenAI``.
        base_url: Optional API base URL passed to ``ChatOpenAI``.

    Returns:
        The ``content`` of the model response.
    """
    llm = ChatOpenAI(
        model=model,
        temperature=0,
        max_tokens=max_tokens,
        timeout=None,
        max_retries=2,
        base_url=base_url,
    )

    # NOTE: both messages are treated as templates, so the system prompt
    # must not contain unescaped braces.
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", translation_prompt),
        ]
    )

    chain = prompt | llm
    response = chain.invoke({"input": text})

    return response.content


def eval_openai(num_shots, datasets, model="gpt-4o-mini", max_new_tokens=300):
    """Translate every test example with an OpenAI chat model.

    The prompt template is built from the first ``num_shots`` examples of
    ``datasets["train"]``; each ``chinese`` entry of ``datasets["test"]`` is
    then translated with ``translate_via_openai``.

    Args:
        num_shots: Number of few-shot examples to embed in the prompt.
        datasets: Mapping with ``"train"`` and ``"test"`` splits.
        model: Model name forwarded to ``translate_via_openai``.
        max_new_tokens: Generation limit forwarded as ``max_tokens``.

    Returns:
        list: One translation per test example, in dataset order.
    """
    translation_prompt = get_few_shot_prompt(datasets["train"], num_shots=num_shots)
    eval_dataset = datasets["test"]
    total = len(eval_dataset)

    # Fetch the source column once. Looking it up inside the loop repeats the
    # column access for every example (presumably materializing the whole
    # column each time for a Hugging Face dataset -- the lookup is
    # loop-invariant either way).
    source_texts = eval_dataset["chinese"]

    predictions = []
    for i in tqdm(range(total)):
        output = translate_via_openai(
            source_texts[i],
            translation_prompt,
            model=model,
            max_tokens=max_new_tokens,
        )
        predictions.append(output)

    return predictions


def convert_time_to_seconds(time_str):
    """Convert a colon-separated duration string to a number of seconds.

    Accepted layouts are ``HH:MM:SS``, ``MM:SS`` and ``SS``. A string with
    more than three components yields ``0``.

    Args:
        time_str: Duration string whose components are integers.

    Returns:
        int: Total number of seconds (``0`` for more than three components).

    Raises:
        ValueError: If a component is not an integer literal.
    """
    components = [int(piece) for piece in time_str.split(":")]

    # Layouts with more than three fields are not interpreted.
    if len(components) > 3:
        return 0

    # Base-60 accumulation: each further field shifts the running total up
    # by one unit (hours -> minutes -> seconds).
    total_seconds = 0
    for value in components:
        total_seconds = total_seconds * 60 + value

    return total_seconds


# Captures the 5-10 characters between "[" and "<00:00"
# (presumably the elapsed time of a finished progress bar -- verify against
# the actual log format).
time_pattern = re.compile(r"\[(.{5,10})<00:00")
# Matches "<model>/shots-<shots> metrics:"; group 1 is the model, group 2 the
# shots value.
metrics_pattern = re.compile(r"(.*)/shots-(.*) metrics:")


def process_log_file(log_file, total_entries):
    """Extract per-entry evaluation times from one log file.

    The file is scanned line by line. The most recent line matching
    ``time_pattern`` is remembered; whenever a later line matches
    ``metrics_pattern``, one record is produced from the model and shots
    captured on that line and the remembered time, converted to seconds and
    divided by ``total_entries``.

    A metrics line that appears before any time line is skipped with a
    message. Parsing is best-effort: if an error occurs, it is printed,
    scanning of this file stops, and the records collected so far are
    returned. All three values of a record are resolved before any of them is
    stored, so the result columns always have equal length.

    Args:
        log_file: Path of the log file to scan.
        total_entries: Number of evaluated entries the logged time covers.

    Returns:
        pandas.DataFrame with the columns ``model``, ``shots`` and
        ``eval_time`` (seconds per entry).
    """
    model = []
    shots = []
    eval_time = []
    last_time_match = None

    with open(log_file, "r") as f:
        try:
            for line in f:
                time_match = time_pattern.search(line)
                if time_match:
                    last_time_match = time_match
                    continue

                metrics_match = metrics_pattern.search(line)
                if not metrics_match:
                    continue

                if last_time_match is None:
                    # No timing information seen yet for this metrics line.
                    print(f"Skipping metrics line without preceding time: {log_file}")
                    continue

                # Resolve every value first; append only once all succeeded.
                model_name, shots_value = metrics_match.groups()
                seconds_per_entry = (
                    convert_time_to_seconds(last_time_match.groups()[0])
                    / total_entries
                )

                model.append(model_name)
                shots.append(shots_value)
                eval_time.append(seconds_per_entry)
        except Exception as e:
            print(f"Error processing log file: {log_file}")
            print(e)

    df = pd.DataFrame(
        {
            "model": model,
            "shots": shots,
            "eval_time": eval_time,
        }
    )
    return df


def load_eval_times(logs_folder, total_entries=1133):
    """Collect the evaluation times of every log file in a folder.

    All paths matching ``<logs_folder>/*`` are processed in sorted order with
    ``process_log_file`` and the results are concatenated; the ``shots``
    column is converted to ``int`` at the end.

    Args:
        logs_folder: Folder containing the log files.
        total_entries: Forwarded to ``process_log_file``.

    Returns:
        pandas.DataFrame with the columns ``model``, ``shots`` and
        ``eval_time``.
    """
    log_paths = sorted(glob.glob(os.path.join(logs_folder, "*")))

    time_df = pd.DataFrame({"model": [], "shots": [], "eval_time": []})

    for log_path in log_paths:
        print(f"Loading content of {log_path}")
        file_times = process_log_file(log_path, total_entries=total_entries)
        time_df = pd.concat([time_df, file_times], ignore_index=True)

    # The shots values are captured from text, so convert them to integers.
    time_df["shots"] = time_df["shots"].apply(lambda raw_shots: int(raw_shots))
    return time_df


def load_alpaca_data(data_path):
    """Return the training data in Alpaca format, building the file if needed.

    If ``data/alpaca_mac.json`` exists it is read and returned. Otherwise the
    translation dataset is loaded from ``data_path``, every training example
    is turned into a record with ``system``, ``instruction`` (zero-shot
    prompt with the Chinese text filled in), an empty ``input`` and
    ``output`` (the English reference), and the records are written to that
    JSON file.

    Args:
        data_path: Path of the source TSV, forwarded to
            ``load_translation_dataset``.

    Returns:
        pandas.DataFrame with the Alpaca-style records.
    """
    alpaca_data_path = "data/alpaca_mac.json"

    # Fast path: reuse a previously generated file.
    if os.path.exists(alpaca_data_path):
        print("loading existing data from:", alpaca_data_path)
        return pd.read_json(alpaca_data_path, orient="records", lines=False)

    datasets = load_translation_dataset(data_path)
    prompt_template = get_few_shot_prompt(datasets["train"], num_shots=0)

    df_train = datasets["train"].to_pandas()

    def _build_instruction(example):
        return prompt_template.format(input=example["chinese"])

    df_train["instruction"] = df_train.apply(_build_instruction, axis=1)

    record_count = len(df_train)
    alpaca_columns = {
        "system": [system_prompt] * record_count,
        "instruction": df_train["instruction"].to_list(),
        "input": [""] * record_count,
        "output": df_train["english"].to_list(),
    }
    df_alpaca = pd.DataFrame(alpaca_columns)

    df_alpaca.to_json(alpaca_data_path, orient="records", lines=False, indent=2)

    return df_alpaca