Spaces:

yuantao-infini-ai
/

demo_test

Runtime error

File size: 7,844 Bytes

cf1798b

"""
Clean chatbot arena battle log.

Usage:
python3 clean_battle_data.py --mode conv_release
"""
import argparse
import datetime
import json
import os
from pytz import timezone
import time

from tqdm import tqdm

from fastchat.serve.monitor.basic_stats import get_log_files, NUM_SERVERS
from fastchat.utils import detect_language


VOTES = ["tievote", "leftvote", "rightvote", "bothbad_vote"]
IDENTITY_WORDS = [
    "vicuna",
    "lmsys",
    "koala",
    "uc berkeley",
    "open assistant",
    "laion",
    "chatglm",
    "chatgpt",
    "openai",
    "anthropic",
    "claude",
    "bard",
    "palm",
    "lamda",
    "google",
    "llama",
    "NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.",
    "$MODERATION$ YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES.",
]

for i in range(len(IDENTITY_WORDS)):
    IDENTITY_WORDS[i] = IDENTITY_WORDS[i].lower()


def get_log_files(max_num_files=None):
    dates = []
    for month in range(4, 12):
        for day in range(1, 33):
            dates.append(f"2023-{month:02d}-{day:02d}")

    filenames = []
    for d in dates:
        for i in range(NUM_SERVERS):
            name = os.path.expanduser(f"~/fastchat_logs/server{i}/{d}-conv.json")
            if os.path.exists(name):
                filenames.append(name)
    max_num_files = max_num_files or len(filenames)
    filenames = filenames[-max_num_files:]
    return filenames


def remove_html(raw):
    if raw.startswith("<h3>"):
        return raw[raw.find(": ") + 2 : -len("</h3>\n")]
    return raw


def to_openai_format(messages):
    roles = ["user", "assistant"]
    ret = []
    for i, x in enumerate(messages):
        ret.append({"role": roles[i % 2], "content": x[1]})
    return ret


def replace_model_name(old_name):
    return (
        old_name.replace("bard", "palm-2")
        .replace("claude-v1", "claude-1")
        .replace("claude-instant-v1", "claude-instant-1")
        .replace("oasst-sft-1-pythia-12b", "oasst-pythia-12b")
    )


def clean_battle_data(log_files, exclude_model_names):
    data = []
    for filename in tqdm(log_files, desc="read files"):
        for retry in range(5):
            try:
                lines = open(filename).readlines()
                break
            except FileNotFoundError:
                time.sleep(2)

        for l in lines:
            row = json.loads(l)
            if row["type"] in VOTES:
                data.append(row)

    convert_type = {
        "leftvote": "model_a",
        "rightvote": "model_b",
        "tievote": "tie",
        "bothbad_vote": "tie (bothbad)",
    }

    all_models = set()
    all_ips = dict()
    ct_anony = 0
    ct_invalid = 0
    ct_leaked_identity = 0
    battles = []
    for row in data:
        if row["models"][0] is None or row["models"][1] is None:
            continue

        # Resolve model names
        models_public = [remove_html(row["models"][0]), remove_html(row["models"][1])]
        if "model_name" in row["states"][0]:
            models_hidden = [
                row["states"][0]["model_name"],
                row["states"][1]["model_name"],
            ]
            if models_hidden[0] is None:
                models_hidden = models_public
        else:
            models_hidden = models_public

        if (models_public[0] == "" and models_public[1] != "") or (
            models_public[1] == "" and models_public[0] != ""
        ):
            ct_invalid += 1
            continue

        if models_public[0] == "" or models_public[0] == "Model A":
            anony = True
            models = models_hidden
            ct_anony += 1
        else:
            anony = False
            models = models_public
            if not models_public == models_hidden:
                ct_invalid += 1
                continue

        # Detect langauge
        state = row["states"][0]
        if state["offset"] >= len(state["messages"]):
            ct_invalid += 1
            continue
        lang_code = detect_language(state["messages"][state["offset"]][1])

        # Drop conversations if the model names are leaked
        leaked_identity = False
        messages = ""
        for i in range(2):
            state = row["states"][i]
            for role, msg in state["messages"][state["offset"] :]:
                if msg:
                    messages += msg.lower()
        for word in IDENTITY_WORDS:
            if word in messages:
                leaked_identity = True
                break

        if leaked_identity:
            ct_leaked_identity += 1
            continue

        # Replace bard with palm
        models = [replace_model_name(m) for m in models]

        # Exclude certain models
        if any(x in exclude_model_names for x in models):
            ct_invalid += 1
            continue

        question_id = row["states"][0]["conv_id"]
        conversation_a = to_openai_format(
            row["states"][0]["messages"][row["states"][0]["offset"] :]
        )
        conversation_b = to_openai_format(
            row["states"][1]["messages"][row["states"][1]["offset"] :]
        )

        ip = row["ip"]
        if ip not in all_ips:
            all_ips[ip] = len(all_ips)
        user_id = all_ips[ip]

        # Save the results
        battles.append(
            dict(
                question_id=question_id,
                model_a=models[0],
                model_b=models[1],
                winner=convert_type[row["type"]],
                judge=f"arena_user_{user_id}",
                conversation_a=conversation_a,
                conversation_b=conversation_b,
                turn=len(conversation_a) // 2,
                anony=anony,
                language=lang_code,
                tstamp=row["tstamp"],
            )
        )

        all_models.update(models_hidden)
    battles.sort(key=lambda x: x["tstamp"])
    last_updated_tstamp = battles[-1]["tstamp"]

    last_updated_datetime = datetime.datetime.fromtimestamp(
        last_updated_tstamp, tz=timezone("US/Pacific")
    ).strftime("%Y-%m-%d %H:%M:%S %Z")

    print(
        f"#votes: {len(data)}, #invalid votes: {ct_invalid}, "
        f"#leaked_identity: {ct_leaked_identity}"
    )
    print(f"#battles: {len(battles)}, #anony: {ct_anony}")
    print(f"#models: {len(all_models)}, {all_models}")
    print(f"last-updated: {last_updated_datetime}")

    return battles


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--max-num-files", type=int)
    parser.add_argument(
        "--mode", type=str, choices=["simple", "conv_release"], default="simple"
    )
    parser.add_argument("--exclude-model-names", type=str, nargs="+")
    args = parser.parse_args()

    log_files = get_log_files(args.max_num_files)
    battles = clean_battle_data(log_files, args.exclude_model_names or [])
    last_updated_tstamp = battles[-1]["tstamp"]
    cutoff_date = datetime.datetime.fromtimestamp(
        last_updated_tstamp, tz=timezone("US/Pacific")
    ).strftime("%Y%m%d")

    if args.mode == "simple":
        for x in battles:
            for key in [
                "conversation_a",
                "conversation_b",
                "question_id",
            ]:
                del x[key]
        print("Samples:")
        for i in range(4):
            print(battles[i])
        output = f"clean_battle_{cutoff_date}.json"
    elif args.mode == "conv_release":
        new_battles = []
        for x in battles:
            if not x["anony"]:
                continue
            for key in []:
                del x[key]
            new_battles.append(x)
        battles = new_battles
        output = f"clean_battle_conv_{cutoff_date}.json"

    with open(output, "w") as fout:
        json.dump(battles, fout, indent=2, ensure_ascii=False)
    print(f"Write cleaned data to {output}")