# SeeForMe-Video: moondream/eval/naturalbench.py
# Evaluate moondream2 on the NaturalBench vision-language benchmark.
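# Note: the relative import below means this script has to run in package
# context, e.g. `python -m moondream.eval.naturalbench` from the repo root.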
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

from ..hf import detect_device

MODEL_ID = "vikhyatk/moondream2"
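
# detect_device (from the sibling hf module) returns a (device, dtype) pair;
# presumably it picks an available accelerator and a matching precision
# (e.g. CUDA + float16), falling back to CPU otherwise.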
DEVICE, DTYPE = detect_device()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
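
# flash_attention_2 requires the flash-attn package and a supported CUDA GPU;
# drop attn_implementation to fall back to the default attention backend.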
moondream = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    torch_dtype=DTYPE,
    device_map={"": DEVICE},
)
moondream.eval()  # inference mode: disables dropout and other train-time behavior
# Yes, the benchmark test set is stored in the 'train' split...
dataset = load_dataset("BaiqiL/NaturalBench", split="train")
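
# Each row holds two images (Image_0, Image_1), two questions (Question_0,
# Question_1), and the four ground-truth answers for their combinations.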
acc = []    # per (image, question) pair accuracy
q_acc = []  # question accuracy: a question scores only if it is correct on both images
i_acc = []  # image accuracy: an image scores only if both of its questions are correct
g_acc = []  # group accuracy: all four (image, question) pairs must be correct
for row in tqdm(dataset):
    # Yes/no questions expect literal "yes"/"no" answers, so nudge the model
    # toward that format; other question types are prompted as-is.
    if row["Question_Type"] == "yes_no":
        suffix = " Answer yes or no."
    else:
        suffix = ""
    answers = moondream.batch_answer(
        images=[row["Image_0"], row["Image_1"], row["Image_0"], row["Image_1"]],
        prompts=[
            row["Question_0"] + suffix,
            row["Question_0"] + suffix,
            row["Question_1"] + suffix,
            row["Question_1"] + suffix,
        ],
        tokenizer=tokenizer,
    )
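    # Ground-truth answers, in the same order as the prompts above.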
    expected = [
        row["Image_0_Question_0"],
        row["Image_1_Question_0"],
        row["Image_0_Question_1"],
        row["Image_1_Question_1"],
    ]
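    # Scoring is exact string match, so the model's answers must match the
    # ground truth verbatim (e.g. "Yes" vs. "yes" would count as wrong).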
    acc.append(answers[0] == expected[0])
    acc.append(answers[1] == expected[1])
    acc.append(answers[2] == expected[2])
    acc.append(answers[3] == expected[3])
    i_acc.append(answers[0] == expected[0] and answers[2] == expected[2])
    i_acc.append(answers[1] == expected[1] and answers[3] == expected[3])
    q_acc.append(answers[0] == expected[0] and answers[1] == expected[1])
    q_acc.append(answers[2] == expected[2] and answers[3] == expected[3])
    g_acc.append(
        answers[0] == expected[0]
        and answers[1] == expected[1]
        and answers[2] == expected[2]
        and answers[3] == expected[3]
    )
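
# Report the four NaturalBench metrics over the full dataset.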
print("Overall Accuracy:", sum(acc) / len(acc))
print("Image Accuracy:", sum(i_acc) / len(i_acc))
print("Question Accuracy:", sum(q_acc) / len(q_acc))
print("Group Accuracy:", sum(g_acc) / len(g_acc))