Commit 6169a19 by huangshiyu (parent: 3c598b1): update
Files changed:
- app.py +22 -6
- compute_accuracy.py +47 -0
- constants.py +13 -0
- eval_final_results.py +11 -0
app.py
CHANGED
@@ -3,9 +3,12 @@ __all__ = ['block', 'make_clickable_model', 'make_clickable_user', 'get_submissi
 import gradio as gr
 import pandas as pd
 import json
+import traceback
 
 from constants import *
 from huggingface_hub import Repository
+from eval_final_results import eval_final
+
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
@@ -37,11 +40,19 @@ def add_new_eval(
     if input_file is None:
         return "Error! Empty file!"
 
-    upload_data = json.loads(input_file)
+    # upload_data = json.loads(input_file)
+
     submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN,
                                  repo_type="dataset",git_user="auto-uploader",git_email="uploader@163.com")
     submission_repo.git_pull()
     csv_data = pd.read_csv(CSV_DIR)
+    try:
+        upload_data = eval_final(test_answer_file,dev_answer_file, input_file)
+    except:
+        error_message = traceback.format_exc()
+        print("Error:", error_message)
+        return
+
 
     if LLM_type == 'Other':
         LLM_name = LLM_name_textbox
@@ -72,11 +83,16 @@ def add_new_eval(
         model_date,
         model_link
     ]
-
-
-
-
-
+    try:
+        for key in TASK_INFO:
+            if key in upload_data:
+                new_data.append(round(100*upload_data[key_map[key]],1))
+            else:
+                new_data.append(0)
+    except:
+        error_message = traceback.format_exc()
+        print("Error:", error_message)
+        return
     # print(new_data)
     # print(csv_data.loc[col-1])
     csv_data.loc[col] = new_data
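A note on the row-scoring block added above: the membership test checks the short column name (`key in upload_data`), but the lookup uses the mapped long name (`upload_data[key_map[key]]`). Since `eval_final` returns long-name keys (see `key_map` in constants.py below), a consistent version would test the mapped key. A minimal sketch of the intended logic, with made-up sample values:

# Sketch of the row-scoring step, assuming upload_data is the dict returned
# by eval_final with fractional accuracies keyed by the long metric names.
# TASK_INFO and key_map are abbreviated from constants.py; the accuracy
# values are invented for illustration.
TASK_INFO = ["Dev Avg", "Test Avg", "MR"]
key_map = {"Dev Avg": "dev avg", "Test Avg": "test avg", "MR": "Motion Recognition"}
upload_data = {"dev avg": 0.4567, "test avg": 0.5123, "Motion Recognition": 0.489}

new_data = []
for key in TASK_INFO:
    if key_map[key] in upload_data:  # test the mapped key, not the short name
        # convert a fractional accuracy to a percentage with one decimal place
        new_data.append(round(100 * upload_data[key_map[key]], 1))
    else:
        new_data.append(0)

print(new_data)  # [45.7, 51.2, 48.9]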
compute_accuracy.py
ADDED
@@ -0,0 +1,47 @@
+import json
+import jsonlines
+from collections import defaultdict
+
+
+def compute_accuracy(answer_file: str, video_meta_file: str):
+    total_qa_num = 0
+    total_answered_num = 0
+    right_num = 0
+
+    category_right = defaultdict(float)
+    category_total = defaultdict(float)
+    category_acc = defaultdict(float)
+
+    with open(answer_file) as f:
+        model_answers = json.load(f)
+
+    with jsonlines.open(video_meta_file) as reader:
+        video_meta = list(reader)
+    for meta_data in video_meta:
+        for qa in meta_data['qa']:
+            uid = str(qa["uid"])
+            if uid in model_answers:
+                total_answered_num += 1
+                model_answer = model_answers[uid]
+
+                meta_data['question_type'] = [meta_data['question_type']]
+                if qa["answer"] == "NA":
+                    continue
+                for category in meta_data['question_type']:
+                    category_total[category] += 1
+                    if model_answer == qa["answer"]:
+                        category_right[category] += 1
+
+                if model_answer == qa["answer"]:
+                    right_num += 1
+            total_qa_num += 1
+
+    for key in category_total:
+        category_acc[key] = category_right[key] / category_total[key]
+
+    acc = float(right_num) / total_qa_num
+    answered_acc = float(right_num) / total_answered_num
+    category_acc.update({"acc": acc, "answered_acc": answered_acc, "total_qa_num": total_qa_num,
+                         "total_answered_num": total_answered_num, "right_num": right_num})
+    return category_acc
+
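For reference, a small self-contained usage sketch, assuming the formats the function implies: answer_file is a JSON dict mapping question uid to the predicted answer, and video_meta_file is JSONL where each record carries a question_type string and a qa list of {uid, answer} entries. (Note that the function rewraps meta_data['question_type'] inside the qa loop, so a record with more than one qa entry would wrap it twice; the sample below keeps one qa per record.) File names are hypothetical:

import json
from compute_accuracy import compute_accuracy

# Two videos, one question each; the model answers uid 1 correctly.
with open("answers.json", "w") as f:
    json.dump({"1": "A", "2": "C"}, f)

with open("meta.jsonl", "w") as f:
    f.write(json.dumps({"question_type": "Motion Recognition",
                        "qa": [{"uid": 1, "answer": "A"}]}) + "\n")
    f.write(json.dumps({"question_type": "Camera Motion",
                        "qa": [{"uid": 2, "answer": "B"}]}) + "\n")

result = compute_accuracy("answers.json", "meta.jsonl")
print(result["acc"])                 # 0.5  (1 of 2 questions right)
print(result["Motion Recognition"])  # 1.0  per-category accuracy
print(result["Camera Motion"])       # 0.0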
constants.py
CHANGED
@@ -5,12 +5,25 @@ MODEL_INFO = ["Model", "Language Model", "Date"]
 TASK_INFO = ["Dev Avg", "Test Avg", "MR", "LM", "CM", "MO", "AO", "RC"]
 AVG_INFO = ["Dev Avg", "Test Avg", "MR", "LM", "CM", "MO", "AO", "RC"]
 
+key_map = {
+    "Dev Avg": "dev avg",
+    "Test Avg": "test avg",
+    "MR": "Motion Recognition",
+    "LM": "Location-related Motion",
+    "CM": "Camera Motion",
+    "MO": "Motion-related Objects",
+    "AO": "Action Order",
+    "RC": "Repetition Count"
+}
+
 DATA_TITILE_TYPE = ['markdown', 'markdown', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number',
                     'number', 'number']
 
 SUBMISSION_NAME = "MotionBench_submission"
 SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/THUDM/", SUBMISSION_NAME)
 CSV_DIR = "./MotionBench_submission/result.csv"
+test_answer_file = "./MotionBench_submission/test_ans_video_info.meta.jsonl"
+dev_answer_file = "./MotionBench_submission/dev_ans_video_info.meta.jsonl"
 
 COLUMN_NAMES = MODEL_INFO + TASK_INFO
 
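A quick sanity check on the column layout defined above (the assert is mine, not in the repo): the three markdown columns from MODEL_INFO plus the eight number columns from TASK_INFO must line up with the eleven entries of DATA_TITILE_TYPE (the misspelling is the source's own identifier):

MODEL_INFO = ["Model", "Language Model", "Date"]
TASK_INFO = ["Dev Avg", "Test Avg", "MR", "LM", "CM", "MO", "AO", "RC"]
DATA_TITILE_TYPE = ['markdown'] * 3 + ['number'] * 8  # spelling as in the source

COLUMN_NAMES = MODEL_INFO + TASK_INFO
assert len(COLUMN_NAMES) == len(DATA_TITILE_TYPE) == 11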
eval_final_results.py
ADDED
@@ -0,0 +1,11 @@
+from compute_accuracy import compute_accuracy
+
+def eval_final(test_metafile,dev_metafile,to_eval):
+    print("Computing accuracy...")
+    result_test = compute_accuracy(to_eval, test_metafile)
+    result_dev = compute_accuracy(to_eval, dev_metafile)
+
+    output = {"dev avg": result_dev['answered_acc'],
+              "test avg": result_test['answered_acc'],
+              **result_test}
+    return output
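End to end, app.py calls eval_final with the two metadata paths from constants.py and the user's uploaded submission. Note that only result_dev's answered_acc survives as "dev avg"; the per-category keys in the output come from the test split via **result_test. A hypothetical invocation (the submission path is made up):

from eval_final_results import eval_final

# Arguments: test metafile, dev metafile, then the submission file to score.
scores = eval_final("./MotionBench_submission/test_ans_video_info.meta.jsonl",
                    "./MotionBench_submission/dev_ans_video_info.meta.jsonl",
                    "my_submission.json")  # hypothetical uid -> answer JSON
print(scores["dev avg"], scores["test avg"])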