Spaces:
Sleeping
Sleeping
IrinaArmstrong
committed on
Commit
•
939f502
1
Parent(s):
aaa657c
added info & about descriptions, fixed model types
Browse files- app.py +1 -1
- src/about.py +56 -5
- src/leaderboard/read_evals.py +8 -1
- src/submission/submit.py +119 -119
app.py
CHANGED
@@ -143,7 +143,7 @@ with demo:
|
|
143 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
144 |
|
145 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
146 |
-
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
147 |
with gr.Row():
|
148 |
with gr.Column():
|
149 |
with gr.Row():
|
|
|
143 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
144 |
|
145 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
146 |
+
with gr.TabItem("🏅 MindShift LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
147 |
with gr.Row():
|
148 |
with gr.Column():
|
149 |
with gr.Row():
|
src/about.py
CHANGED
@@ -30,20 +30,70 @@ NUM_FEWSHOT = 0 # Change with your few shot
|
|
30 |
|
31 |
|
32 |
# Your leaderboard name
|
33 |
-
TITLE = """<h1 align="center" id="space-title">MindShift
|
34 |
|
35 |
# What does your leaderboard evaluate?
|
36 |
INTRODUCTION_TEXT = """
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
"""
|
39 |
|
40 |
# Which evaluations are you running? how can people reproduce what you have?
|
41 |
LLM_BENCHMARKS_TEXT = f"""
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
-
|
45 |
-
To reproduce our results, here are the commands you can run:
|
46 |
|
|
|
|
|
47 |
"""
|
48 |
|
49 |
EVALUATION_QUEUE_TEXT = """
|
@@ -78,4 +128,5 @@ If everything is done, check you can launch the EleutherAIHarness on your model
|
|
78 |
|
79 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
80 |
CITATION_BUTTON_TEXT = r"""
|
|
|
81 |
"""
|
|
|
30 |
|
31 |
|
32 |
# Your leaderboard name
|
33 |
+
TITLE = """<h1 align="center" id="space-title">MindShift: Analyzing LLMs Reactions to Psychological Prompts</h1>"""
|
34 |
|
35 |
# What does your leaderboard evaluate?
|
36 |
INTRODUCTION_TEXT = """
|
37 |
+
Welcome to the leaderboard of the MindShift!
|
38 |
+
|
39 |
+
Have you ever wondered how you can measure how much your LLM is following the role it has been given? Or how depressed or optimistic it is?
|
40 |
+
|
41 |
+
For this purpose, we offer you a handy tool: MindShift.
|
42 |
+
|
43 |
+
MindShift is a benchmark for assessing the psychological susceptibility of LLMs, such as perception, recognition and role performance with psychological characteristics. It is based on an adaptation for AI models of a human person-oriented psychometric test, the Minnesota Multiphasic Personality Inventory (MMPI).
|
44 |
+
|
45 |
+
It is easy to use and can assess any LLM - both instruction-tuned models and their base versions. Its scales, which are easily interpreted by humans, allow you to choose the appropriate language model for your conversational assistant or a game NPC.
|
46 |
+
|
47 |
+
More details on the measurement approach, roles and psychological biases can be found in the '📝 About' tab. See also the paper (coming soon!).
|
48 |
"""
|
49 |
|
50 |
# Which evaluations are you running? how can people reproduce what you have?
|
51 |
LLM_BENCHMARKS_TEXT = f"""
|
52 |
+
Large language models (LLMs) hold the potential to absorb and reflect personality traits and attitudes specified by users.
|
53 |
+
|
54 |
+
<div style='display: flex; align-items: center; justify-content: center; text-align: center;'>
|
55 |
+
<img src='https://raw.githubusercontent.com/IrinaArmstrong/MindShift/master/figs/mindshift-concept.png' style='width: 600px; height: auto; margin-right: 10px;' />
|
56 |
+
</div>
|
57 |
+
|
58 |
+
## How does it work?
|
59 |
+
|
60 |
+
### Questions & Scales
|
61 |
+
To reliably validate the implicit understanding of psychological personality traits in LLMs, it is crucial to adapt psychological interpretations of the scales and formulate questions specific to the language models. When asked explicit questions about inner worlds, morality, and behavioral patterns, LLMs may exhibit biased behaviors due to extensive alignment tuning. This can result in inconsistent and unrepresentative questionnaire outcomes.
|
62 |
+
|
63 |
+
To assess the susceptibility of LLMs to personalization, we utilized the Standardized Multifactorial Method for Personality Research (SMMPR), which is based on the Minnesota Multiphasic Personality Inventory (MMPI). It is a questionnaire-based test consisting of 566 short statements that individuals rate as true or false for themselves.
|
64 |
+
The test assesses psychological characteristics on 10 basic "personality profile" scales, named after the nosological forms of corresponding disorders:
|
65 |
+
* Hypochondria (Hs),
|
66 |
+
* Depression (D),
|
67 |
+
* Emotional Lability (Hy),
|
68 |
+
* Psychopathy (Pd),
|
69 |
+
* Masculinity-Femininity (Mf),
|
70 |
+
* Rigidity/Paranoia (Pa),
|
71 |
+
* Anxiety/Psychasthenia (Pt),
|
72 |
+
* Individualism/Schizophrenia (Sc),
|
73 |
+
* Optimism (Ma),
|
74 |
+
* Social Introversion (Si).
|
75 |
+
|
76 |
+
Additionally, the test includes three validation scales to assess the truthfulness and sincerity of the respondent's answers: Lie (L), Infrequency (F), and Correction/Defensiveness (K).
|
77 |
+
|
78 |
+
To ensure the reproducibility of our methodology for both instruction-tuned and base models, we leveraged the LLM's ability to complete textual queries. We constructed a set of statements from the questionnaire and asked the LLM to finish the prompt with only one option: True or False.
|
79 |
+
|
80 |
+
<div style='display: flex; align-items: center; justify-content: center; text-align: center;'>
|
81 |
+
<img src='https://raw.githubusercontent.com/IrinaArmstrong/MindShift/master/figs/mindshift-statements.png' style='width: 600px; height: auto; margin-right: 10px;' />
|
82 |
+
</div>
|
83 |
+
|
84 |
+
### Psychological prompts
|
85 |
+
|
86 |
+
To measure the extent to which an LLM understands personality, MindShift at its core contains a structured method for introducing psychologically oriented biases into prompts.
|
87 |
+
Introducing specific personality traits into an LLM can be achieved by providing it with a natural language description of the persona. In our methodology, the persona description consists of two parts: the Persona General Descriptor and the Psychological Bias Descriptor. The Persona General Descriptor includes general statements about the character's lifestyle, routines, and social aspects, while the Psychological Bias Descriptor covers specific psychological attitudes with varying degrees of intensity.
|
88 |
+
|
89 |
+
<div style='display: flex; align-items: center; justify-content: center; text-align: center;'>
|
90 |
+
<img src='https://raw.githubusercontent.com/IrinaArmstrong/MindShift/master/figs/mindshift-input-schema.png' style='width: 600px; height: auto; margin-right: 10px;' />
|
91 |
+
</div>
|
92 |
|
93 |
+
The Psychological Bias Descriptors are combined with the Persona General Descriptor - a full character role (including gender, age, marital status, personal circumstances, hobbies, etc.) sampled from the PersonaChat dialogue dataset. Together they form a complete description of the persona.
|
|
|
94 |
|
95 |
+
### Paper
|
96 |
+
You can find more details about the assessment, a list of psychological prompts, roles and experiments in the paper (πcoming soon!).
|
97 |
"""
|
98 |
|
99 |
EVALUATION_QUEUE_TEXT = """
|
|
|
128 |
|
129 |
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
130 |
CITATION_BUTTON_TEXT = r"""
|
131 |
+
(πcoming soon!)
|
132 |
"""
|
src/leaderboard/read_evals.py
CHANGED
@@ -47,6 +47,12 @@ class EvalResult:
|
|
47 |
org_and_model = config.get("model_name", config.get("model_args", None))
|
48 |
org_and_model = org_and_model.split("/", 1)
|
49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
if len(org_and_model) == 1:
|
51 |
org = None
|
52 |
model = org_and_model[0]
|
@@ -85,7 +91,8 @@ class EvalResult:
|
|
85 |
org=org,
|
86 |
model=model,
|
87 |
results=results,
|
88 |
-
precision=precision,
|
|
|
89 |
revision=config.get("model_sha", ""),
|
90 |
still_on_hub=still_on_hub,
|
91 |
architecture=architecture
|
|
|
47 |
org_and_model = config.get("model_name", config.get("model_args", None))
|
48 |
org_and_model = org_and_model.split("/", 1)
|
49 |
|
50 |
+
model_type = ModelType.Unknown
|
51 |
+
if ("instruct" in org_and_model[-1].lower()) or ("-it" in org_and_model[-1].lower()):
|
52 |
+
model_type = ModelType.from_str("instruction-tuned")
|
53 |
+
else:
|
54 |
+
model_type = ModelType.from_str("pretrained")
|
55 |
+
|
56 |
if len(org_and_model) == 1:
|
57 |
org = None
|
58 |
model = org_and_model[0]
|
|
|
91 |
org=org,
|
92 |
model=model,
|
93 |
results=results,
|
94 |
+
precision=precision,
|
95 |
+
model_type=model_type,
|
96 |
revision=config.get("model_sha", ""),
|
97 |
still_on_hub=still_on_hub,
|
98 |
architecture=architecture
|
src/submission/submit.py
CHANGED
@@ -1,119 +1,119 @@
|
|
1 |
-
import json
|
2 |
-
import os
|
3 |
-
from datetime import datetime, timezone
|
4 |
-
|
5 |
-
from src.display.formatting import styled_error, styled_message, styled_warning
|
6 |
-
from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
|
7 |
-
from src.submission.check_validity import (
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
)
|
13 |
-
|
14 |
-
REQUESTED_MODELS = None
|
15 |
-
USERS_TO_SUBMISSION_DATES = None
|
16 |
-
|
17 |
-
def add_new_eval(
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
):
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
|
|
1 |
+
# import json
|
2 |
+
# import os
|
3 |
+
# from datetime import datetime, timezone
|
4 |
+
#
|
5 |
+
# from src.display.formatting import styled_error, styled_message, styled_warning
|
6 |
+
# from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
|
7 |
+
# from src.submission.check_validity import (
|
8 |
+
# already_submitted_models,
|
9 |
+
# check_model_card,
|
10 |
+
# get_model_size,
|
11 |
+
# is_model_on_hub,
|
12 |
+
# )
|
13 |
+
#
|
14 |
+
# REQUESTED_MODELS = None
|
15 |
+
# USERS_TO_SUBMISSION_DATES = None
|
16 |
+
#
|
17 |
+
# def add_new_eval(
|
18 |
+
# model: str,
|
19 |
+
# base_model: str,
|
20 |
+
# revision: str,
|
21 |
+
# precision: str,
|
22 |
+
# weight_type: str,
|
23 |
+
# model_type: str,
|
24 |
+
# ):
|
25 |
+
# global REQUESTED_MODELS
|
26 |
+
# global USERS_TO_SUBMISSION_DATES
|
27 |
+
# if not REQUESTED_MODELS:
|
28 |
+
# REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
|
29 |
+
#
|
30 |
+
# user_name = ""
|
31 |
+
# model_path = model
|
32 |
+
# if "/" in model:
|
33 |
+
# user_name = model.split("/")[0]
|
34 |
+
# model_path = model.split("/")[1]
|
35 |
+
#
|
36 |
+
# precision = precision.split(" ")[0]
|
37 |
+
# current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
38 |
+
#
|
39 |
+
# if model_type is None or model_type == "":
|
40 |
+
# return styled_error("Please select a model type.")
|
41 |
+
#
|
42 |
+
# # Does the model actually exist?
|
43 |
+
# if revision == "":
|
44 |
+
# revision = "main"
|
45 |
+
#
|
46 |
+
# # Is the model on the hub?
|
47 |
+
# if weight_type in ["Delta", "Adapter"]:
|
48 |
+
# base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
|
49 |
+
# if not base_model_on_hub:
|
50 |
+
# return styled_error(f'Base model "{base_model}" {error}')
|
51 |
+
#
|
52 |
+
# if not weight_type == "Adapter":
|
53 |
+
# model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
|
54 |
+
# if not model_on_hub:
|
55 |
+
# return styled_error(f'Model "{model}" {error}')
|
56 |
+
#
|
57 |
+
# # Is the model info correctly filled?
|
58 |
+
# try:
|
59 |
+
# model_info = API.model_info(repo_id=model, revision=revision)
|
60 |
+
# except Exception:
|
61 |
+
# return styled_error("Could not get your model information. Please fill it up properly.")
|
62 |
+
#
|
63 |
+
# model_size = get_model_size(model_info=model_info, precision=precision)
|
64 |
+
#
|
65 |
+
# # Were the model card and license filled?
|
66 |
+
# try:
|
67 |
+
# license = model_info.cardData["license"]
|
68 |
+
# except Exception:
|
69 |
+
# return styled_error("Please select a license for your model")
|
70 |
+
#
|
71 |
+
# modelcard_OK, error_msg = check_model_card(model)
|
72 |
+
# if not modelcard_OK:
|
73 |
+
# return styled_error(error_msg)
|
74 |
+
#
|
75 |
+
# # Seems good, creating the eval
|
76 |
+
# print("Adding new eval")
|
77 |
+
#
|
78 |
+
# eval_entry = {
|
79 |
+
# "model": model,
|
80 |
+
# "base_model": base_model,
|
81 |
+
# "revision": revision,
|
82 |
+
# "precision": precision,
|
83 |
+
# "weight_type": weight_type,
|
84 |
+
# "status": "PENDING",
|
85 |
+
# "submitted_time": current_time,
|
86 |
+
# "model_type": model_type,
|
87 |
+
# "likes": model_info.likes,
|
88 |
+
# "params": model_size,
|
89 |
+
# "license": license,
|
90 |
+
# "private": False,
|
91 |
+
# }
|
92 |
+
#
|
93 |
+
# # Check for duplicate submission
|
94 |
+
# if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
|
95 |
+
# return styled_warning("This model has been already submitted.")
|
96 |
+
#
|
97 |
+
# print("Creating eval file")
|
98 |
+
# OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
|
99 |
+
# os.makedirs(OUT_DIR, exist_ok=True)
|
100 |
+
# out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
|
101 |
+
#
|
102 |
+
# with open(out_path, "w") as f:
|
103 |
+
# f.write(json.dumps(eval_entry))
|
104 |
+
#
|
105 |
+
# print("Uploading eval file")
|
106 |
+
# API.upload_file(
|
107 |
+
# path_or_fileobj=out_path,
|
108 |
+
# path_in_repo=out_path.split("eval-queue/")[1],
|
109 |
+
# repo_id=QUEUE_REPO,
|
110 |
+
# repo_type="dataset",
|
111 |
+
# commit_message=f"Add {model} to eval queue",
|
112 |
+
# )
|
113 |
+
#
|
114 |
+
# # Remove the local file
|
115 |
+
# os.remove(out_path)
|
116 |
+
#
|
117 |
+
# return styled_message(
|
118 |
+
# "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
|
119 |
+
# )
|