JoshuaChak committed
Commit ddb8425
1 Parent(s): 5d6e5a6

Upload folder using huggingface_hub

C-Eval/README.md ADDED
@@ -0,0 +1,19 @@
+
+ # Usage Guide
+ ## Building the Project
+ Please refer to LLM-TPU/models/ChatGLM3/eval-demo/README.md to build the project.
+
+ ## Preparing the Data
+ Download and unpack the dataset:
+ ```
+ mkdir ceval-exam
+ cd ceval-exam
+ wget https://huggingface.co/datasets/ceval/ceval-exam/resolve/main/ceval-exam.zip
+ unzip ceval-exam.zip
+ ```
+
+ ## Running the Evaluation
+ ```
+ export PYTHONPATH=../../
+ python evaluate_chatglm3.py --devid 10 --model_path ../../models/ChatGLM3/compile/chatglm3-6b_int4_1dev.bmodel --tokenizer_path $PATH_TO_TOKENIZER --eval_mode fast
+ ```
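With `--eval_mode fast`, the evaluator records a single option letter per question, and the submission file follows the C-Eval layout: one JSON object per subject, mapping question indices (as strings) to the predicted option. A minimal sanity-check sketch (the file name here is an assumption; evaluate_chatglm3.py derives it from the bmodel suffix):

```python
import json

# Assumed output name: submission_int8.json / submission_int4.json /
# submission_f16.json, depending on the bmodel suffix matched in main().
with open("submission_int4.json") as f:
    submission = json.load(f)

# In fast mode every prediction should be one of the four option letters.
for subject, preds in submission.items():
    bad = [i for i, p in preds.items() if p not in ("A", "B", "C", "D")]
    if bad:
        print(f"{subject}: {len(bad)} malformed predictions")
print(f"checked {len(submission)} subjects")
```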
C-Eval/evaluate_chatglm3.py ADDED
@@ -0,0 +1,160 @@
+ import os
+ import json
+ import argparse
+ from tqdm import tqdm
+ import pandas as pd
+ from models.ChatGLM3.eval_demo import chat
+ from transformers import AutoTokenizer
+ import re
+
+ choices = ["A", "B", "C", "D"]
+
+ def load_json(json_path):
+     with open(json_path, 'r') as f:
+         res = json.load(f)
+     return res
+
+ def dump_json(dic, json_path):
+     with open(json_path, 'w') as json_file:
+         json.dump(dic, json_file)
+     return
+
+ def construct_prompt(subject, dev_row, test_row, example_num):
+     sys_pattern = "以下是中国关于{}考试的单项选择题,请选出其中的正确答案。\n\n"
+     question_pattern = "{}\nA. {}\nB. {}\nC. {}\nD. {}\n答案:{}\n"
+     test_pattern = "{}\nA. {}\nB. {}\nC. {}\nD. {}\n答案:"
+
+     res = sys_pattern.format(subject)
+     for i in range(example_num):
+         res = res + question_pattern.format(dev_row[i].question, dev_row[i].A, dev_row[i].B, dev_row[i].C, dev_row[i].D, dev_row[i].answer)
+     res = res + test_pattern.format(test_row.question, test_row.A, test_row.B, test_row.C, test_row.D)
+     return res
+
+ def bmodel_infer(model, tokenizer, prompt, history):
+     # Full generation: decode the complete answer text.
+     tokens = tokenizer.build_chat_input(prompt, history=history)['input_ids'].tolist()[0]
+     token = model.generate(tokens, tokenizer.eos_token_id)
+     answer_cur = tokenizer.decode(token)
+     return answer_cur
+
+ def bmodel_generate_option(model, tokenizer, prompt, history):
+     # Fast path: predict only the option token.
+     tokens = tokenizer.build_chat_input(prompt, history=history)['input_ids'].tolist()[0]
+     token = model.predict_option(tokens)
+     return token
+
+ def extract_cot_answer(line, gen_ans):
+     m = re.findall(r'所以答案是(.+?)。', gen_ans, re.M)
+     if len(m) > 0 and m[-1] in choices:
+         return m[-1], True
+     answer_patterns = [
+         r'([ABCD])是正确的',
+         r'选项([ABCD])正确',
+         r'答案为([ABCD])',
+         r'答案是([ABCD])',
+         r'答案([ABCD])',
+         r'选择([ABCD])',
+         r'答案:([ABCD])',
+         r'选择答案([ABCD])'
+     ]
+     # RE extraction
+     for answer_pattern in answer_patterns:
+         m = re.search(answer_pattern, gen_ans, re.M)
+         if m:
+             answer = m.group(1)
+             return answer, False
+     # only containing one choice-character
+     m = re.findall(r'[ABCD]', gen_ans, re.M)
+     if len(m) == 1:
+         answer = m[0]
+         return answer, False
+     # only containing one choice-context
+     answer = '-'
+     answer_word_counter = 0
+     for c in choices:
+         if str(line[f'{c}']) in gen_ans:
+             answer = c
+             answer_word_counter += 1
+     if answer_word_counter == 1:
+         return answer, False
+     return '-', False
+
+ def main(args):
+     # 1. define params
+     example_num = 0
+     dev_path = "ceval-exam/dev"
+     test_path = "ceval-exam/test"
+     if "int8" in args.model_path:
+         submit_path = "submission_int8.json"
+     elif "int4" in args.model_path:
+         submit_path = "submission_int4.json"
+     elif "f16" in args.model_path:
+         submit_path = "submission_f16.json"
+     else:
+         submit_path = "submission.json"
+     subject_path = "subject_mapping.json"
+     subject_map = load_json(subject_path)
+
+     # 2. create engine
+     model = chat.ChatGLM()
+     devices = [int(d) for d in args.devid.split(",")]
+     model.init(devices, args.model_path)
+     model.temperature = args.temperature
+     model.top_p = args.top_p
+     model.repeat_penalty = args.repeat_penalty
+     model.repeat_last_n = args.repeat_last_n
+     model.max_new_tokens = args.max_new_tokens
+     model.generation_mode = args.generation_mode
+     model.prompt_mode = args.prompt_mode
+     tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, trust_remote_code=True)
+
+     # 3. inference
+     res = {}
+     subject_num = len(os.listdir(test_path))
+     print(f"Subject numbers: {subject_num}")
+     count = 0
+     # Sort both listings so each dev file is paired with the test file of the same subject.
+     for dev_csv_file, test_csv_file in zip(sorted(os.listdir(dev_path)), sorted(os.listdir(test_path))):
+         count = count + 1
+         dev_csv_path = os.path.join(dev_path, dev_csv_file)
+         test_csv_path = os.path.join(test_path, test_csv_file)
+         dev_df = pd.read_csv(dev_csv_path)
+         test_df = pd.read_csv(test_csv_path)
+
+         subject = test_csv_file.replace("_test.csv", "")
+         subject_zh = subject_map[subject][1]
+         dev_row = [dev_df.loc[i] for i in range(example_num)]
+
+         subject_dict = {}
+         print("======================================")
+         print("Current subject:", subject)
+         print("subject no:", count)
+         print("======================================")
+         for i in tqdm(range(len(test_df))):
+             prompt = construct_prompt(subject_zh, dev_row, test_df.loc[i], example_num)
+             print("")
+             print("prompt:", prompt)
+             if args.eval_mode == "fast":
+                 pred = bmodel_generate_option(model, tokenizer, prompt, history=[])
+             else:
+                 pred = bmodel_infer(model, tokenizer, prompt, history=[])
+             print("prediction:", pred)
+             subject_dict[str(i)] = pred
+         res[subject] = subject_dict
+
+     # 4. deinit & save
+     dump_json(res, submit_path)
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument('-d', '--devid', type=str, default='0', help='device ID to use')
+     parser.add_argument('--model_path', type=str, help='Path to the bmodel file.')
+     parser.add_argument('--tokenizer_path', type=str, help='Path to the tokenizer file.')
+     parser.add_argument('--temperature', type=float, default=1.0, help='temperature scaling factor for the likelihood distribution')
+     parser.add_argument('--top_p', type=float, default=1.0, help='cumulative probability of token words to consider as a set of candidates')
+     parser.add_argument('--repeat_penalty', type=float, default=1.0, help='penalty for repeated tokens')
+     parser.add_argument('--repeat_last_n', type=int, default=32, help='repeat penalty for recent n tokens')
+     parser.add_argument('--max_new_tokens', type=int, default=1024, help='max new token length to generate')
+     parser.add_argument('--generation_mode', type=str, choices=["greedy", "penalty_sample"], default="greedy", help='mode for generating next token')
+     parser.add_argument('--prompt_mode', type=str, choices=["prompted", "unprompted"], default="prompted", help='use prompt format or original input')
+     parser.add_argument('--eval_mode', type=str, choices=["fast", "default"], default="default", help='eval mode (fast or default)')
+
+     args = parser.parse_args()
+     main(args)
C-Eval/subject_mapping.json ADDED
@@ -0,0 +1,262 @@
+ {
+     "computer_network": [
+         "Computer Network",
+         "\u8ba1\u7b97\u673a\u7f51\u7edc",
+         "STEM"
+     ],
+     "operating_system": [
+         "Operating System",
+         "\u64cd\u4f5c\u7cfb\u7edf",
+         "STEM"
+     ],
+     "computer_architecture": [
+         "Computer Architecture",
+         "\u8ba1\u7b97\u673a\u7ec4\u6210",
+         "STEM"
+     ],
+     "college_programming": [
+         "College Programming",
+         "\u5927\u5b66\u7f16\u7a0b",
+         "STEM"
+     ],
+     "college_physics": [
+         "College Physics",
+         "\u5927\u5b66\u7269\u7406",
+         "STEM"
+     ],
+     "college_chemistry": [
+         "College Chemistry",
+         "\u5927\u5b66\u5316\u5b66",
+         "STEM"
+     ],
+     "advanced_mathematics": [
+         "Advanced Mathematics",
+         "\u9ad8\u7b49\u6570\u5b66",
+         "STEM"
+     ],
+     "probability_and_statistics": [
+         "Probability and Statistics",
+         "\u6982\u7387\u7edf\u8ba1",
+         "STEM"
+     ],
+     "discrete_mathematics": [
+         "Discrete Mathematics",
+         "\u79bb\u6563\u6570\u5b66",
+         "STEM"
+     ],
+     "electrical_engineer": [
+         "Electrical Engineer",
+         "\u6ce8\u518c\u7535\u6c14\u5de5\u7a0b\u5e08",
+         "STEM"
+     ],
+     "metrology_engineer": [
+         "Metrology Engineer",
+         "\u6ce8\u518c\u8ba1\u91cf\u5e08",
+         "STEM"
+     ],
+     "high_school_mathematics": [
+         "High School Mathematics",
+         "\u9ad8\u4e2d\u6570\u5b66",
+         "STEM"
+     ],
+     "high_school_physics": [
+         "High School Physics",
+         "\u9ad8\u4e2d\u7269\u7406",
+         "STEM"
+     ],
+     "high_school_chemistry": [
+         "High School Chemistry",
+         "\u9ad8\u4e2d\u5316\u5b66",
+         "STEM"
+     ],
+     "high_school_biology": [
+         "High School Biology",
+         "\u9ad8\u4e2d\u751f\u7269",
+         "STEM"
+     ],
+     "middle_school_mathematics": [
+         "Middle School Mathematics",
+         "\u521d\u4e2d\u6570\u5b66",
+         "STEM"
+     ],
+     "middle_school_biology": [
+         "Middle School Biology",
+         "\u521d\u4e2d\u751f\u7269",
+         "STEM"
+     ],
+     "middle_school_physics": [
+         "Middle School Physics",
+         "\u521d\u4e2d\u7269\u7406",
+         "STEM"
+     ],
+     "middle_school_chemistry": [
+         "Middle School Chemistry",
+         "\u521d\u4e2d\u5316\u5b66",
+         "STEM"
+     ],
+     "veterinary_medicine": [
+         "Veterinary Medicine",
+         "\u517d\u533b\u5b66",
+         "STEM"
+     ],
+     "college_economics": [
+         "College Economics",
+         "\u5927\u5b66\u7ecf\u6d4e\u5b66",
+         "Social Science"
+     ],
+     "business_administration": [
+         "Business Administration",
+         "\u5de5\u5546\u7ba1\u7406",
+         "Social Science"
+     ],
+     "marxism": [
+         "Marxism",
+         "\u9a6c\u514b\u601d\u4e3b\u4e49\u57fa\u672c\u539f\u7406",
+         "Social Science"
+     ],
+     "mao_zedong_thought": [
+         "Mao Zedong Thought",
+         "\u6bdb\u6cfd\u4e1c\u601d\u60f3\u548c\u4e2d\u56fd\u7279\u8272\u793e\u4f1a\u4e3b\u4e49\u7406\u8bba\u4f53\u7cfb\u6982\u8bba",
+         "Social Science"
+     ],
+     "education_science": [
+         "Education Science",
+         "\u6559\u80b2\u5b66",
+         "Social Science"
+     ],
+     "teacher_qualification": [
+         "Teacher Qualification",
+         "\u6559\u5e08\u8d44\u683c",
+         "Social Science"
+     ],
+     "high_school_politics": [
+         "High School Politics",
+         "\u9ad8\u4e2d\u653f\u6cbb",
+         "Social Science"
+     ],
+     "high_school_geography": [
+         "High School Geography",
+         "\u9ad8\u4e2d\u5730\u7406",
+         "Social Science"
+     ],
+     "middle_school_politics": [
+         "Middle School Politics",
+         "\u521d\u4e2d\u653f\u6cbb",
+         "Social Science"
+     ],
+     "middle_school_geography": [
+         "Middle School Geography",
+         "\u521d\u4e2d\u5730\u7406",
+         "Social Science"
+     ],
+     "modern_chinese_history": [
+         "Modern Chinese History",
+         "\u8fd1\u4ee3\u53f2\u7eb2\u8981",
+         "Humanities"
+     ],
+     "ideological_and_moral_cultivation": [
+         "Ideological and Moral Cultivation",
+         "\u601d\u60f3\u9053\u5fb7\u4fee\u517b\u4e0e\u6cd5\u5f8b\u57fa\u7840",
+         "Humanities"
+     ],
+     "logic": [
+         "Logic",
+         "\u903b\u8f91\u5b66",
+         "Humanities"
+     ],
+     "law": [
+         "Law",
+         "\u6cd5\u5b66",
+         "Humanities"
+     ],
+     "chinese_language_and_literature": [
+         "Chinese Language and Literature",
+         "\u4e2d\u56fd\u8bed\u8a00\u6587\u5b66",
+         "Humanities"
+     ],
+     "art_studies": [
+         "Art Studies",
+         "\u827a\u672f\u5b66",
+         "Humanities"
+     ],
+     "professional_tour_guide": [
+         "Professional Tour Guide",
+         "\u5bfc\u6e38\u8d44\u683c",
+         "Humanities"
+     ],
+     "legal_professional": [
+         "Legal Professional",
+         "\u6cd5\u5f8b\u804c\u4e1a\u8d44\u683c",
+         "Humanities"
+     ],
+     "high_school_chinese": [
+         "High School Chinese",
+         "\u9ad8\u4e2d\u8bed\u6587",
+         "Humanities"
+     ],
+     "high_school_history": [
+         "High School History",
+         "\u9ad8\u4e2d\u5386\u53f2",
+         "Humanities"
+     ],
+     "middle_school_history": [
+         "Middle School History",
+         "\u521d\u4e2d\u5386\u53f2",
+         "Humanities"
+     ],
+     "civil_servant": [
+         "Civil Servant",
+         "\u516c\u52a1\u5458",
+         "Other"
+     ],
+     "sports_science": [
+         "Sports Science",
+         "\u4f53\u80b2\u5b66",
+         "Other"
+     ],
+     "plant_protection": [
+         "Plant Protection",
+         "\u690d\u7269\u4fdd\u62a4",
+         "Other"
+     ],
+     "basic_medicine": [
+         "Basic Medicine",
+         "\u57fa\u7840\u533b\u5b66",
+         "Other"
+     ],
+     "clinical_medicine": [
+         "Clinical Medicine",
+         "\u4e34\u5e8a\u533b\u5b66",
+         "Other"
+     ],
+     "urban_and_rural_planner": [
+         "Urban and Rural Planner",
+         "\u6ce8\u518c\u57ce\u4e61\u89c4\u5212\u5e08",
+         "Other"
+     ],
+     "accountant": [
+         "Accountant",
+         "\u6ce8\u518c\u4f1a\u8ba1\u5e08",
+         "Other"
+     ],
+     "fire_engineer": [
+         "Fire Engineer",
+         "\u6ce8\u518c\u6d88\u9632\u5de5\u7a0b\u5e08",
+         "Other"
+     ],
+     "environmental_impact_assessment_engineer": [
+         "Environmental Impact Assessment Engineer",
+         "\u73af\u5883\u5f71\u54cd\u8bc4\u4ef7\u5de5\u7a0b\u5e08",
+         "Other"
+     ],
+     "tax_accountant": [
+         "Tax Accountant",
+         "\u7a0e\u52a1\u5e08",
+         "Other"
+     ],
+     "physician": [
+         "Physician",
+         "\u533b\u5e08\u8d44\u683c",
+         "Other"
+     ]
+ }
Hisence/src/categories.py ADDED
@@ -0,0 +1,147 @@
+ name_en2zh = {
+     "agronomy": "农学",
+     "anatomy": "解剖学",
+     "ancient_chinese": "古汉语",
+     "arts": "艺术学",
+     "astronomy": "天文学",
+     "business_ethics": "商业伦理",
+     "chinese_civil_service_exam": "中国公务员考试",
+     "chinese_driving_rule": "中国驾驶规则",
+     "chinese_food_culture": "中国饮食文化",
+     "chinese_foreign_policy": "中国外交政策",
+     "chinese_history": "中国历史",
+     "chinese_literature": "中国文学",
+     "chinese_teacher_qualification": "中国教师资格",
+     "clinical_knowledge": "临床知识",
+     "college_actuarial_science": "大学精算学",
+     "college_education": "大学教育学",
+     "college_engineering_hydrology": "大学工程水文学",
+     "college_law": "大学法律",
+     "college_mathematics": "大学数学",
+     "college_medical_statistics": "大学医学统计",
+     "college_medicine": "大学医学",
+     "computer_science": "计算机科学",
+     "computer_security": "计算机安全",
+     "conceptual_physics": "概念物理学",
+     "construction_project_management": "建设工程管理",
+     "economics": "经济学",
+     "education": "教育学",
+     "electrical_engineering": "电气工程",
+     "elementary_chinese": "小学语文",
+     "elementary_commonsense": "小学常识",
+     "elementary_information_and_technology": "小学信息技术",
+     "elementary_mathematics": "初等数学",
+     "ethnology": "民族学",
+     "food_science": "食品科学",
+     "genetics": "遗传学",
+     "global_facts": "全球事实",
+     "high_school_biology": "高中生物",
+     "high_school_chemistry": "高中化学",
+     "high_school_geography": "高中地理",
+     "high_school_mathematics": "高中数学",
+     "high_school_physics": "高中物理学",
+     "high_school_politics": "高中政治",
+     "human_sexuality": "人类性行为",
+     "international_law": "国际法学",
+     "journalism": "新闻学",
+     "jurisprudence": "法理学",
+     "legal_and_moral_basis": "法律与道德基础",
+     "logical": "逻辑学",
+     "machine_learning": "机器学习",
+     "management": "管理学",
+     "marketing": "市场营销",
+     "marxist_theory": "马克思主义理论",
+     "modern_chinese": "现代汉语",
+     "nutrition": "营养学",
+     "philosophy": "哲学",
+     "professional_accounting": "专业会计",
+     "professional_law": "专业法学",
+     "professional_medicine": "专业医学",
+     "professional_psychology": "专业心理学",
+     "public_relations": "公共关系",
+     "security_study": "安全研究",
+     "sociology": "社会学",
+     "sports_science": "体育学",
+     "traditional_chinese_medicine": "中医中药",
+     "virology": "病毒学",
+     "world_history": "世界历史",
+     "world_religions": "世界宗教",
+ }
+
+ subcategories = {
+     "agronomy": ['other'],
+     "anatomy": ['biology'],
+     "ancient_chinese": ['linguistics', 'china specific'],
+     "arts": ['arts'],
+     "astronomy": ['physics'],
+     "business_ethics": ['business'],
+     "chinese_civil_service_exam": ['politics', 'china specific'],
+     "chinese_driving_rule": ['other', 'china specific'],
+     "chinese_food_culture": ['culture', 'china specific'],
+     "chinese_foreign_policy": ['politics', 'china specific'],
+     "chinese_history": ['history', 'china specific'],
+     "chinese_literature": ['literature', 'china specific'],
+     "chinese_teacher_qualification": ['education', 'china specific'],
+     "college_actuarial_science": ['math'],
+     "college_education": ['education'],
+     "college_engineering_hydrology": ['engineering'],
+     "college_law": ['law'],
+     "college_mathematics": ['math'],
+     "college_medical_statistics": ['statistics'],
+     "clinical_knowledge": ['other'],
+     "college_medicine": ['other'],
+     "computer_science": ['computer science'],
+     "computer_security": ['other'],
+     "conceptual_physics": ['physics'],
+     "construction_project_management": ['other', 'china specific'],
+     "economics": ['economics'],
+     "education": ['education'],
+     "elementary_chinese": ['linguistics', 'china specific'],
+     "elementary_commonsense": ['other', 'china specific'],
+     "elementary_information_and_technology": ['other'],
+     "electrical_engineering": ['engineering'],
+     "elementary_mathematics": ['math'],
+     "ethnology": ['culture', 'china specific'],
+     "food_science": ['other'],
+     "genetics": ['biology'],
+     "global_facts": ['global'],
+     "high_school_biology": ['biology'],
+     "high_school_chemistry": ['chemistry'],
+     "high_school_geography": ['geography'],
+     "high_school_mathematics": ['math'],
+     "high_school_physics": ['physics'],
+     "high_school_politics": ['politics', 'china specific'],
+     "human_sexuality": ['other'],
+     "international_law": ['law'],
+     "journalism": ['sociology'],
+     "jurisprudence": ['law'],
+     "legal_and_moral_basis": ['other'],
+     "logical": ['philosophy'],
+     "machine_learning": ['computer science'],
+     "management": ['business'],
+     "marketing": ['business'],
+     "marxist_theory": ['philosophy'],
+     "modern_chinese": ['linguistics', 'china specific'],
+     "nutrition": ['other'],
+     "philosophy": ['philosophy'],
+     "professional_accounting": ['business'],
+     "professional_law": ['law'],
+     "professional_medicine": ['other'],
+     "professional_psychology": ['psychology'],
+     "public_relations": ['politics'],
+     "security_study": ['politics'],
+     "sociology": ['culture'],
+     "sports_science": ['other'],
+     "traditional_chinese_medicine": ['other', 'china specific'],
+     "virology": ['biology'],
+     "world_history": ['history'],
+     "world_religions": ['global'],
+ }
+
+ categories = {
+     "STEM": ["physics", "chemistry", "biology", "computer science", "math", "engineering", "statistics"],
+     "Humanities": ["history", "philosophy", "law", "arts", "literature", "global"],
+     "Social Science": ["linguistics", "business", "politics", "culture", "economics", "geography", "psychology", "education", "sociology"],
+     "Other": ["other"],
+     "China specific": ["china specific"],
+ }
Hisence/src/chatglm3.py ADDED
@@ -0,0 +1,187 @@
+ import os
+ import torch
+ import numpy as np
+ import argparse
+ from mp_utils import choices, format_example, gen_prompt, softmax, run_eval, run_subject_eval
+
+ from peft import PeftModel
+ from transformers import AutoModel, AutoTokenizer
+
+ def bmodel_infer(model, tokenizer, prompt, history):
+     tokens = tokenizer.build_chat_input(prompt, history=history)['input_ids'].tolist()[0]
+     answer_token = model.generate(tokens, tokenizer.eos_token_id)
+     answer_cur = tokenizer.decode(answer_token)
+     return answer_cur
+
+ def bmodel_infer_fast(model, tokenizer, prompt, history):
+     tokens = tokenizer.build_chat_input(prompt, history=history)['input_ids'].tolist()[0]
+     answer_token = model.forward_first(tokens)
+     answer_cur = tokenizer.decode(answer_token)
+     return answer_cur
+
+ def eval_chat(model, tokenizer, subject, dev_df, test_df, num_few_shot, max_length, cot, device):
+     cors = []
+     all_preds = []
+     answers = choices[: test_df.shape[1] - 2]
+
+     for i in range(test_df.shape[0]):
+         prompt_end = format_example(test_df, i, subject, include_answer=False, cot=cot)
+         prompt = gen_prompt(dev_df=dev_df,
+                             subject=subject,
+                             prompt_end=prompt_end,
+                             num_few_shot=num_few_shot,
+                             tokenizer=tokenizer,
+                             max_length=max_length,
+                             cot=cot)
+         label = test_df.iloc[i, test_df.shape[1] - 1]
+
+         if device == "cuda":
+             pred, history = model.chat(tokenizer, prompt, history=[])
+             print("prompt:", prompt)
+             print("pred:", pred)
+             print("label:", label)
+         elif device == "tpu":
+             pred = bmodel_infer_fast(model, tokenizer, prompt, history=[])
+             print()
+             print("================================================")
+             print("prompt:", prompt)
+             if pred:
+                 print("pred:", pred)
+                 print("pred[0]:", pred[0])
+                 print("acc:", bool(pred[0] == label))
+             print("label:", label)
+         if pred and pred[0] in choices:
+             cors.append(pred[0] == label)
+         all_preds.append(pred.replace("\n", ""))
+
+     acc = np.mean(cors)
+     print("Average accuracy {:.3f} - {}".format(acc, subject))
+     print("{} results, {} inappropriately formatted answers.".format(len(cors), len(all_preds) - len(cors)))
+     return acc, all_preds, None
+
+ ALL_SUBJECTS = [
+     "agronomy",
+     "anatomy",
+     "ancient_chinese",
+     "arts",
+     "astronomy",
+     "business_ethics",
+     "chinese_civil_service_exam",
+     "chinese_driving_rule",
+     "chinese_food_culture",
+     "chinese_foreign_policy",
+     "chinese_history",
+     "chinese_literature",
+     "chinese_teacher_qualification",
+     "clinical_knowledge",
+     "college_actuarial_science",
+     "college_education",
+     "college_engineering_hydrology",
+     "college_law",
+     "college_mathematics",
+     "college_medical_statistics",
+     "college_medicine",
+     "computer_science",
+     "computer_security",
+     "conceptual_physics",
+     "construction_project_management",
+     "economics",
+     "education",
+     "electrical_engineering",
+     "elementary_chinese",
+     "elementary_commonsense",
+     "elementary_information_and_technology",
+     "elementary_mathematics",
+     "ethnology",
+     "food_science",
+     "genetics",
+     "global_facts",
+     "high_school_biology",
+     "high_school_chemistry",
+     "high_school_geography",
+     "high_school_mathematics",
+     "high_school_physics",
+     "high_school_politics",
+     "human_sexuality",
+     "international_law",
+     "journalism",
+     "jurisprudence",
+     "legal_and_moral_basis",
+     "logical",
+     "machine_learning",
+     "management",
+     "marketing",
+     "marxist_theory",
+     "modern_chinese",
+     "nutrition",
+     "philosophy",
+     "professional_accounting",
+     "professional_law",
+     "professional_medicine",
+     "professional_psychology",
+     "public_relations",
+     "security_study",
+     "sociology",
+     "sports_science",
+     "traditional_chinese_medicine",
+     "virology",
+     "world_history",
+     "world_religions"
+ ]
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model_name_or_path", type=str, default="")
+     parser.add_argument("--lora_weights", type=str, default="")
+     parser.add_argument("--data_dir", type=str, default="data")
+     parser.add_argument("--save_dir", type=str, default="results/ChatGLM-6B")
+     parser.add_argument("--num_few_shot", type=int, default=0)
+     parser.add_argument("--max_length", type=int, default=2048)
+     parser.add_argument("--load_in_8bit", action='store_true')
+     parser.add_argument("--subjects", type=str, nargs='+', default=ALL_SUBJECTS)  # e.g. ['high_school_geography', 'electrical_engineering']
+     parser.add_argument("--cot", action='store_true')
+     parser.add_argument("--device", type=str, choices=["cuda", "tpu"], default="cuda")
+     parser.add_argument('--model_path', type=str, required=True, help='path to the bmodel file')
+     parser.add_argument('--top_p', type=float, default=1.0, help='cumulative probability of token words to consider as a set of candidates')
+     parser.add_argument('--repeat_penalty', type=float, default=1.0, help='penalty for repeated tokens')
+     parser.add_argument('--repeat_last_n', type=int, default=32, help='repeat penalty for recent n tokens')
+     parser.add_argument('--max_new_tokens', type=int, default=1024, help='max new token length to generate')
+     parser.add_argument('--temperature', type=float, default=1.0, help='temperature scaling factor for the likelihood distribution')
+     parser.add_argument("--devid", type=str, default='0')
+     parser.add_argument("--tokenizer_path", type=str, default="")
+     parser.add_argument('--generation_mode', type=str, default="greedy", help='mode for generating next token')
+     parser.add_argument('--prompt_mode', type=str, choices=["prompted", "unprompted"], default="prompted", help='use prompt format or original input')
+     args = parser.parse_args()
+
+     # Initialize models
+     if args.device == 'cuda':
+         # NOTE: this branch currently runs on CPU in float32; the
+         # .half().cuda() variant is kept commented out below for reference.
+         device = torch.device("cpu")
+         tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, trust_remote_code=True)
+         model = AutoModel.from_pretrained(args.model_name_or_path,
+                                           trust_remote_code=True, torch_dtype=torch.float)
+         # load_in_8bit=args.load_in_8bit,
+         # ).half().cuda()
+         model.to(device)
+     elif args.device == "tpu":
+         from ChatGLM3.python_demo import chat
+         devices = [int(d) for d in args.devid.split(",")]
+         model = chat.ChatGLM()
+         model.init(devices, args.model_path)
+         model.temperature = args.temperature
+         model.top_p = args.top_p
+         model.repeat_penalty = args.repeat_penalty
+         model.repeat_last_n = args.repeat_last_n
+         model.max_new_tokens = args.max_new_tokens
+         model.generation_mode = args.generation_mode
+         model.prompt_mode = args.prompt_mode
+         tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, trust_remote_code=True)
+     print("subject:", args.subjects)
+     # Always use Chat-style evaluation
+     # run_eval(model, tokenizer, eval_chat, args)
+     run_subject_eval(model, tokenizer, eval_chat, args)
Hisence/src/mp_utils.py ADDED
@@ -0,0 +1,241 @@
+ import os
+ import re
+ import glob
+ import random
+ import os.path as osp
+ import numpy as np
+ import pandas as pd
+ from collections import defaultdict
+ from categories import name_en2zh, subcategories, categories
+
+ choices = ["A", "B", "C", "D"]
+
+ category2subject = defaultdict(list)
+ for k, v in categories.items():
+     for subject, subcat in subcategories.items():
+         for c in subcat:
+             if c in v:
+                 category2subject[k].append(subject)
+
+
+ def format_example(df, idx, subject, include_answer=True, cot=False):
+     prompt_start = "题目:"
+     prompt = prompt_start + df.iloc[idx, 0]
+     k = df.shape[1] - 2
+     for j in range(k):
+         prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j + 1])
+
+     # Chain-of-thought
+     if cot:
+         prompt += "\n逐步分析并给出答案选项。"
+     else:
+         prompt += "\n答案是:"
+
+     if include_answer:
+         prompt += "{}\n\n".format(df.iloc[idx, k + 1])
+     return prompt
+
+ def gen_prompt(dev_df, subject, prompt_end, num_few_shot=0, tokenizer=None, max_length=2048, cot=False):
+     if cot:  # Chain-of-thought
+         prompt = "以下是关于{}的单项选择题,请分析并选出正确答案。\n\n".format(name_en2zh[subject])
+     else:
+         prompt = "以下是关于{}的单项选择题,请直接给出正确答案的选项。\n\n".format(name_en2zh[subject])
+
+     # If no tokenizer is given, don't consider the max length.
+     if tokenizer is None:
+         for i in range(num_few_shot):
+             example = format_example(dev_df, i, subject)
+             prompt += example
+         return prompt + prompt_end
+
+     start_end_token_len = len(tokenizer.encode(prompt) + tokenizer.encode(prompt_end))
+     # If it does not fit even without few-shot examples, drop the leading instruction.
+     if start_end_token_len > max_length:
+         return prompt_end
+
+     prompt_list = []
+     if num_few_shot > 0:
+         for i in range(num_few_shot):
+             example = format_example(dev_df, i, subject)
+             prompt_list.append((example, tokenizer.encode(example)))
+
+         # Drop the longest shots until the prompt fits within max_length.
+         while prompt_list != [] and sum(len(e[1]) for e in prompt_list) >= max_length - start_end_token_len:
+             print(f"Warning: {len(prompt_list)} shot case exceeds max_input_length, remove 1 shot.")
+             longest_length = max([len(e[1]) for e in prompt_list])
+             prompt_list = [e for e in prompt_list if len(e[1]) != longest_length]
+         for p in prompt_list:
+             prompt += p[0]
+
+     return prompt + prompt_end
+
+
+ def softmax(x):
+     z = x - max(x)
+     numerator = np.exp(z)
+     denominator = np.sum(numerator)
+     probs = numerator / denominator
+     return probs
+
+ def run_subject_eval(model, tokenizer, eval_fn, args):
+
+     # subjects = sorted([f.split(".csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test/"))])
+     subjects = args.subjects
+     args.save_dir = f"{args.save_dir}_{args.num_few_shot}_shot"
+     if not os.path.exists(args.save_dir):
+         os.mkdir(args.save_dir)
+
+     for subject in subjects:
+         out_file = os.path.join(args.save_dir, f"results_{subject}.csv")
+         # if os.path.exists(out_file):  # If the result file exists, skip this subject
+         #     continue
+         dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + ".csv"), header=0, index_col=0)
+         test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + ".csv"), header=0, index_col=0)
+
+         acc, preds, confs = eval_fn(model=model,
+                                     tokenizer=tokenizer,
+                                     subject=subject,
+                                     dev_df=dev_df,
+                                     test_df=test_df,
+                                     num_few_shot=args.num_few_shot,
+                                     max_length=args.max_length,
+                                     cot=args.cot if 'cot' in args else False,
+                                     device=args.device)
+         test_df['prediction'] = preds
+         if 'with_conf' in args and args.with_conf:
+             test_df['conf'] = confs
+
+         test_df.to_csv(out_file, header=None, mode="w")
+
+     # print results
+     get_results(args.save_dir)
+
+
+ def run_eval(model, tokenizer, eval_fn, args):
+
+     if model:
+         model.eval()
+
+     subjects = sorted([f.split(".csv")[0] for f in os.listdir(os.path.join(args.data_dir, "test/"))])
+     args.save_dir = f"{args.save_dir}_{args.num_few_shot}_shot"
+     if not os.path.exists(args.save_dir):
+         os.mkdir(args.save_dir)
+
+     for subject in subjects:
+         out_file = os.path.join(args.save_dir, f"results_{subject}.csv")
+         if os.path.exists(out_file):  # If the result file exists, skip this subject
+             continue
+         dev_df = pd.read_csv(os.path.join(args.data_dir, "dev", subject + ".csv"), header=0, index_col=0)
+         test_df = pd.read_csv(os.path.join(args.data_dir, "test", subject + ".csv"), header=0, index_col=0)
+
+         acc, preds, confs = eval_fn(model=model,
+                                     tokenizer=tokenizer,
+                                     subject=subject,
+                                     dev_df=dev_df,
+                                     test_df=test_df,
+                                     num_few_shot=args.num_few_shot,
+                                     max_length=args.max_length,
+                                     cot=args.cot if 'cot' in args else False)
+         test_df['prediction'] = preds
+         if 'with_conf' in args and args.with_conf:
+             test_df['conf'] = confs
+
+         test_df.to_csv(out_file, header=None)
+
+     # print results
+     get_results(args.save_dir)
+
+
+ def extract_choice(response):
+     '''
+     Always return a choice, even when none can be matched by regex,
+     to ensure a fair comparison with other models.
+     '''
+     response = str(response)
+     if response and response[0] in choices:
+         return response[0]
+     # 1. Single match
+     patterns = [
+         (r'答案(选项)?(是|为):? ?([ABCD])', 3),
+         (r'答案(是|为)选项 ?([ABCD])', 2),
+         (r'故?选择?:? ?([ABCD])', 1),
+         (r'([ABCD]) ?选?项(是|为)?正确', 1),
+         (r'正确的?选项(是|为) ?([ABCD])', 2),
+         (r'答案(应该)?(是|为)([ABCD])', 3),
+         (r'选项 ?([ABCD]) ?(是|为)?正确', 1),
+         (r'选择答案 ?([ABCD])', 1),
+         (r'答案?:?([ABCD])', 1),
+         (r'([ABCD])(选?项)?是?符合题意', 1),
+         (r'答案选项:? ?([ABCD])', 1),  # chatglm
+         (r'答案(选项)?为(.*?)([ABCD])', 3),  # chatgpt
+     ]
+     for pattern, idx in patterns:
+         m = re.search(pattern, response, re.M)
+         if m:
+             answer = m.group(idx)
+             assert answer in choices
+             return answer
+
+     # 2. Recursive match: keep searching for the last occurrence.
+     patterns = [
+         (r'([ABCD])(.*?)当选', 1),
+         (r'([ABCD])(.*?)正确', 1),
+     ]
+     for pattern, idx in patterns:
+         m = re.search(pattern, response, re.M)
+         if m:
+             while m:
+                 answer = m.group(idx)
+                 m = re.search(pattern, m.group(0)[1:], re.M)
+             assert answer in choices
+             return answer
+
+     # 3. Weak single match
+     patterns = [
+         (r'[^不]是:? ?([ABCD])', 1),
+     ]
+     for pattern, idx in patterns:
+         m = re.search(pattern, response, re.M)
+         if m:
+             answer = m.group(idx)
+             assert answer in choices
+             return answer
+
+     # 4. Check the only mentioned choice
+     pattern = r'^[^ABCD]*([ABCD])[^ABCD]*$'
+     m = re.match(pattern, response)
+     if m:
+         answer = m.group(1)
+         assert answer in choices
+         return answer
+
+     return choices[random.randint(0, 3)]
+
+
+ def get_results(result_dir=''):
+
+     all_acc = defaultdict(float)
+     all_df = []
+     for subject in name_en2zh.keys():
+         try:
+             file = glob.glob(osp.join(result_dir, f"results_{subject}.csv"))[0]
+         except IndexError:
+             print(f"Warning: {subject} result file not found")
+             continue
+         df = pd.read_csv(file, names=['id', 'question', 'A', 'B', 'C', 'D', 'answer', 'response'], index_col=0)
+         # To deal with some mismatch between data and answer
+         if df.iloc[0]['question'] == '1':
+             df = df.drop(0)
+         df['pred'] = df['response'].apply(extract_choice)
+         df['acc'] = df['answer'] == df['pred']
+         acc = np.mean(df['acc']) * 100
+         all_acc[subject] = acc
+         all_df.append(df)
+
+     all_df = pd.concat(all_df)
+     for k, v in category2subject.items():
+         avg_acc = np.mean(list(map(lambda x: all_acc[x], v)))
+         print(f"{k:40s} {avg_acc:.2f}")
+     avg_all_acc = np.mean(list(all_acc.values()))
+     print(f"{'Overall':30s} {avg_all_acc:.2f}")
+
+     return all_acc
Hisence/src/qwen1.5.py ADDED
@@ -0,0 +1,188 @@
+ import os
+ import torch
+ import numpy as np
+ import argparse
+ from tqdm import tqdm
+ from mp_utils import choices, format_example, gen_prompt, softmax, run_eval, run_subject_eval
+
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+ def eval_chat(model, tokenizer, subject, dev_df, test_df, num_few_shot, max_length, cot, device):
+     cors = []
+     all_preds = []
+     answers = choices[: test_df.shape[1] - 2]
+
+     for i in tqdm(range(test_df.shape[0])):
+         prompt_end = format_example(test_df, i, subject, include_answer=False, cot=cot)
+         prompt = gen_prompt(dev_df=dev_df,
+                             subject=subject,
+                             prompt_end=prompt_end,
+                             num_few_shot=num_few_shot,
+                             tokenizer=tokenizer,
+                             max_length=max_length,
+                             cot=cot)
+         label = test_df.iloc[i, test_df.shape[1] - 1]
+
+         # Run inference on the prompt
+         messages = [
+             {"role": "system", "content": "You are a helpful assistant."},
+             {"role": "user", "content": prompt}
+         ]
+         text = tokenizer.apply_chat_template(
+             messages,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+         if device == "cuda":
+             model_inputs = tokenizer([text], return_tensors="pt").to(device)
+             generated_ids = model.generate(
+                 model_inputs.input_ids,
+                 max_new_tokens=512,
+                 temperature=0.5
+             )
+             generated_ids = [
+                 output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+             ]
+
+         elif device == "tpu":
+             model_inputs = tokenizer([text])
+             generated_ids = model.generate(
+                 model_inputs.input_ids[0],
+                 tokenizer.eos_token_id
+             )
+         response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+         if response and response[0] in choices:
+             cors.append(response[0] == label)
+         all_preds.append(response.replace("\n", ""))
+
+     acc = np.mean(cors)
+     print("Average accuracy {:.3f} - {}".format(acc, subject))
+     print("{} results, {} inappropriately formatted answers.".format(len(cors), len(all_preds) - len(cors)))
+     return acc, all_preds, None
+
+ ALL_SUBJECTS = [
+     "agronomy",
+     "anatomy",
+     "ancient_chinese",
+     "arts",
+     "astronomy",
+     "business_ethics",
+     "chinese_civil_service_exam",
+     "chinese_driving_rule",
+     "chinese_food_culture",
+     "chinese_foreign_policy",
+     "chinese_history",
+     "chinese_literature",
+     "chinese_teacher_qualification",
+     "clinical_knowledge",
+     "college_actuarial_science",
+     "college_education",
+     "college_engineering_hydrology",
+     "college_law",
+     "college_mathematics",
+     "college_medical_statistics",
+     "college_medicine",
+     "computer_science",
+     "computer_security",
+     "conceptual_physics",
+     "construction_project_management",
+     "economics",
+     "education",
+     "electrical_engineering",
+     "elementary_chinese",
+     "elementary_commonsense",
+     "elementary_information_and_technology",
+     "elementary_mathematics",
+     "ethnology",
+     "food_science",
+     "genetics",
+     "global_facts",
+     "high_school_biology",
+     "high_school_chemistry",
+     "high_school_geography",
+     "high_school_mathematics",
+     "high_school_physics",
+     "high_school_politics",
+     "human_sexuality",
+     "international_law",
+     "journalism",
+     "jurisprudence",
+     "legal_and_moral_basis",
+     "logical",
+     "machine_learning",
+     "management",
+     "marketing",
+     "marxist_theory",
+     "modern_chinese",
+     "nutrition",
+     "philosophy",
+     "professional_accounting",
+     "professional_law",
+     "professional_medicine",
+     "professional_psychology",
+     "public_relations",
+     "security_study",
+     "sociology",
+     "sports_science",
+     "traditional_chinese_medicine",
+     "virology",
+     "world_history",
+     "world_religions"
+ ]
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model_name_or_path", type=str, default="")
+     parser.add_argument("--lora_weights", type=str, default="")
+     parser.add_argument("--data_dir", type=str, default="data")
+     parser.add_argument("--save_dir", type=str, default="results/ChatGLM-6B")
+     parser.add_argument("--num_few_shot", type=int, default=0)
+     parser.add_argument("--max_length", type=int, default=2048)
+     parser.add_argument("--load_in_8bit", action='store_true')
+     parser.add_argument("--subjects", type=str, nargs='+', default=ALL_SUBJECTS)  # e.g. ['high_school_geography', 'electrical_engineering']
+     parser.add_argument("--cot", action='store_true')
+     parser.add_argument("--device", type=str, choices=["cuda", "tpu"], default="cuda")
+     parser.add_argument('--model_path', type=str, required=True, help='path to the bmodel file')
+     parser.add_argument('--top_p', type=float, default=1.0, help='cumulative probability of token words to consider as a set of candidates')
+     parser.add_argument('--max_new_tokens', type=int, default=1024, help='max new token length to generate')
+     parser.add_argument('--temperature', type=float, default=1.0, help='temperature scaling factor for the likelihood distribution')
+     parser.add_argument("--devid", type=str, default='0')
+     parser.add_argument("--tokenizer_path", type=str, default="")
+     parser.add_argument('--generation_mode', type=str, default="greedy", help='mode for generating next token')
+     parser.add_argument('--prompt_mode', type=str, choices=["prompted", "unprompted"], default="prompted", help='use prompt format or original input')
+     args = parser.parse_args()
+
+     # Initialize models
+     if args.device == "cuda":
+         model = AutoModelForCausalLM.from_pretrained(
+             args.model_name_or_path,
+             torch_dtype="auto",
+             device_map="auto"
+         ).eval()
+         tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
+     elif args.device == "tpu":
+         from Qwen1_5.python_demo import chat
+         devices = [int(d) for d in args.devid.split(",")]
+         model = chat.Qwen()
+         model.init(
+             devices,
+             args.model_path,
+             args.temperature,
+             args.top_p,
+             args.max_new_tokens,
+             args.generation_mode,
+             args.prompt_mode,
+         )
+         tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)
+
+     print("subject:", args.subjects)
+     # Always use Chat-style evaluation
+     # run_eval(model, tokenizer, eval_chat, args)
+     run_subject_eval(model, tokenizer, eval_chat, args)
MMLU/README.md ADDED
@@ -0,0 +1,7 @@
+ ## Command
+
+ ```
+ wget https://people.eecs.berkeley.edu/~hendrycks/data.tar
+ tar -xf data.tar
+
+ python evaluate_chatglm3.py --devid 10 --model_path ../../models/ChatGLM3/compile/chatglm3-6b_int4_1dev.bmodel --tokenizer_path ../../models/ChatGLM3/support/tokenizer.model
+ ```
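The script reads few-shot examples from `data/dev/*_dev.csv` and questions from `data/test/*_test.csv` (the `tar` step above unpacks the archive into `data/`). A small sketch to verify the layout before a long run, assuming the default `--data_dir data`:

```python
import os

# Assumed default from evaluate_chatglm3.py (--data_dir data).
data_dir = "data"
for split, suffix in (("dev", "_dev.csv"), ("test", "_test.csv")):
    files = [f for f in os.listdir(os.path.join(data_dir, split)) if f.endswith(suffix)]
    print(f"{split}: {len(files)} subject files")  # the MMLU tarball ships 57 subjects
```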
MMLU/evaluate_chatglm3.py ADDED
@@ -0,0 +1,95 @@
+ import argparse
+ import os
+ import torch
+ import numpy as np
+ import pandas as pd
+ from tqdm import tqdm
+
+ from ChatGLM3.python_demo import chat
+
+ choices = ["A", "B", "C", "D"]
+
+
+ def format_subject(subject):
+     parts = subject.split("_")
+     s = ""
+     for entry in parts:
+         s += " " + entry
+     return s
+
+
+ def format_example(df, idx, include_answer=True):
+     prompt = df.iloc[idx, 0]
+     k = df.shape[1] - 2
+     for j in range(k):
+         prompt += "\n{}. {}".format(choices[j], df.iloc[idx, j + 1])
+     prompt += "\nAnswer:"
+     if include_answer:
+         prompt += " {}\n\n".format(df.iloc[idx, k + 1])
+     return prompt
+
+
+ def gen_prompt(train_df, subject, k=-1):
+     prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format(
+         format_subject(subject)
+     )
+     if k == -1:
+         k = train_df.shape[0]
+     for i in range(k):
+         prompt += format_example(train_df, i)
+     return prompt
+
+
+ def main(args):
+     # 1. define params
+     example_num = 0
+     subjects = sorted(
+         [
+             f.split("_test.csv")[0]
+             for f in os.listdir(os.path.join(args.data_dir, "test"))
+             if "_test.csv" in f
+         ]
+     )
+
+     # 2. create engine
+     devices = [int(d) for d in args.devid.split(",")]
+     engine = chat.ChatGLM()
+     engine.init(devices, args.model_path, args.tokenizer_path)
+
+     # 3. construct prompt & inference
+     all_cors = []
+     for subject in subjects:
+         dev_df = pd.read_csv(
+             os.path.join(args.data_dir, "dev", subject + "_dev.csv"), header=None
+         )[:example_num]
+         test_df = pd.read_csv(
+             os.path.join(args.data_dir, "test", subject + "_test.csv"), header=None
+         )
+
+         cors = []
+         for i in tqdm(range(len(test_df))):
+             prompt_end = format_example(test_df, i, include_answer=False)
+             few_shot_prompt = gen_prompt(dev_df, subject, example_num)
+             prompt = few_shot_prompt + prompt_end
+             pred = engine.predict_option(prompt)
+             label = test_df.iloc[i, test_df.shape[1] - 1]
+             cors.append(pred == label)
+         subject_acc = np.mean(cors)
+         print("Average accuracy {:.3f} - {}".format(subject_acc, subject))
+         all_cors.append(cors)
+
+     # 4. deinit & compute overall accuracy
+     engine.deinit()
+     weighted_acc = np.mean(np.concatenate(all_cors))
+     print("Average accuracy: {:.3f}".format(weighted_acc))
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--data_dir", "-d", type=str, default="data")
+     parser.add_argument('--devid', type=str, help='Device ID to use.')
+     parser.add_argument('--model_path', type=str, help='Path to the bmodel file.')
+     parser.add_argument('--tokenizer_path', type=str, help='Path to the tokenizer file.')
+     args = parser.parse_args()
+     main(args)