Spaces:
Sleeping
Sleeping
XufengDuan
committed on
Commit
·
ce1e7cd
1
Parent(s):
27d8f5d
update scripts
Browse files
- src/backend/model_operations.py +22 -16
- src/display/about.py +13 -1
src/backend/model_operations.py
CHANGED
@@ -394,7 +394,20 @@ class ResponseGenerator:
|
|
394 |
result = result.replace(prompt[0], '')
|
395 |
print(result)
|
396 |
return result
|
397 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
398 |
|
399 |
elif self.local_model is None:
|
400 |
import random
|
@@ -474,20 +487,7 @@ class ResponseGenerator:
|
|
474 |
result = convo.last.text
|
475 |
print(result)
|
476 |
return result
|
477 |
-
|
478 |
-
elif 'gpt' in self.model_id.lower():
|
479 |
-
response = litellm.completion(
|
480 |
-
model=self.model_id.replace('openai/',''),
|
481 |
-
messages=[{"role": "system", "content": system_prompt},
|
482 |
-
{"role": "user", "content": user_prompt}],
|
483 |
-
# temperature=0.0,
|
484 |
-
max_tokens=100,
|
485 |
-
api_key = os.getenv('OpenAI_key')
|
486 |
-
)
|
487 |
-
result = response['choices'][0]['message']['content']
|
488 |
-
# print()
|
489 |
-
print(result)
|
490 |
-
return result
|
491 |
# exit()
|
492 |
# Using local model
|
493 |
|
@@ -640,7 +640,7 @@ class EvaluationModel:
|
|
640 |
filtered_lines.insert(0, lines[0])
|
641 |
else:
|
642 |
filtered_lines = lines
|
643 |
-
print(filtered_lines)
|
644 |
|
645 |
filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
|
646 |
rs = "\n".join(filtered_lines)
|
@@ -884,6 +884,7 @@ class EvaluationModel:
|
|
884 |
human_e5 = create_e5_entries(human_df)
|
885 |
llm_e5 = create_e5_entries(llm_df)
|
886 |
|
|
|
887 |
# Remove E5 and E51 entries from both datasets
|
888 |
human_df = human_df[~human_df['Question_ID'].str.contains('E5')]
|
889 |
llm_df = llm_df[~llm_df['Question_ID'].str.contains('E5')]
|
@@ -895,10 +896,15 @@ class EvaluationModel:
|
|
895 |
|
896 |
### Calculate Average JS Divergence ###
|
897 |
|
|
|
898 |
# Extract the relevant columns for JS divergence calculation
|
899 |
human_responses = human_df[['Question_ID', 'Coding']]
|
900 |
llm_responses = llm_df[['Question_ID', 'Coding']]
|
901 |
|
|
|
|
|
|
|
|
|
902 |
# Get unique Question_IDs present in both datasets
|
903 |
common_question_ids = set(human_responses['Question_ID']).intersection(set(llm_responses['Question_ID']))
|
904 |
|
|
|
394 |
result = result.replace(prompt[0], '')
|
395 |
print(result)
|
396 |
return result
|
397 |
+
# Using OpenAI API
|
398 |
+
elif 'gpt' in self.model_id.lower():
|
399 |
+
response = litellm.completion(
|
400 |
+
model=self.model_id.replace('openai/', ''),
|
401 |
+
messages=[{"role": "system", "content": system_prompt},
|
402 |
+
{"role": "user", "content": user_prompt}],
|
403 |
+
# temperature=0.0,
|
404 |
+
max_tokens=100,
|
405 |
+
api_key=os.getenv('OpenAI_key')
|
406 |
+
)
|
407 |
+
result = response['choices'][0]['message']['content']
|
408 |
+
# print()
|
409 |
+
# print(result)
|
410 |
+
return result
|
411 |
|
412 |
elif self.local_model is None:
|
413 |
import random
|
|
|
487 |
result = convo.last.text
|
488 |
print(result)
|
489 |
return result
|
490 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
491 |
# exit()
|
492 |
# Using local model
|
493 |
|
|
|
640 |
filtered_lines.insert(0, lines[0])
|
641 |
else:
|
642 |
filtered_lines = lines
|
643 |
+
# print(filtered_lines)
|
644 |
|
645 |
filtered_lines = [r.split('-', 1)[-1].strip() if '-' in r else r for r in filtered_lines]
|
646 |
rs = "\n".join(filtered_lines)
|
|
|
884 |
human_e5 = create_e5_entries(human_df)
|
885 |
llm_e5 = create_e5_entries(llm_df)
|
886 |
|
887 |
+
|
888 |
# Remove E5 and E51 entries from both datasets
|
889 |
human_df = human_df[~human_df['Question_ID'].str.contains('E5')]
|
890 |
llm_df = llm_df[~llm_df['Question_ID'].str.contains('E5')]
|
|
|
896 |
|
897 |
### Calculate Average JS Divergence ###
|
898 |
|
899 |
+
|
900 |
# Extract the relevant columns for JS divergence calculation
|
901 |
human_responses = human_df[['Question_ID', 'Coding']]
|
902 |
llm_responses = llm_df[['Question_ID', 'Coding']]
|
903 |
|
904 |
+
# Remove 'Other' responses
|
905 |
+
human_responses = human_responses[human_responses['Coding'] != 'Other']
|
906 |
+
llm_responses = llm_responses[llm_responses['Coding'] != 'Other']
|
907 |
+
|
908 |
# Get unique Question_IDs present in both datasets
|
909 |
common_question_ids = set(human_responses['Question_ID']).intersection(set(llm_responses['Question_ID']))
|
910 |
|
src/display/about.py
CHANGED
@@ -40,7 +40,19 @@ TITLE = """<h1 align="center" id="space-title">Humanlike Evaluation Model (HEM)
|
|
40 |
|
41 |
# What does your leaderboard evaluate?
|
42 |
INTRODUCTION_TEXT = """
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
"""
|
45 |
|
46 |
# Which evaluations are you running? how can people reproduce what you have?
|
|
|
40 |
|
41 |
# What does your leaderboard evaluate?
|
42 |
INTRODUCTION_TEXT = """
|
43 |
+
Welcome to the Humanlikeness Leaderboard, curated by [Xufeng Duan](https://xufengduan.github.io/). This platform rigorously evaluates the alignment between human and model responses in language processing, utilizing ten carefully designed psycholinguistic tasks to quantify a model's humanlikeness:<br><br>
|
44 |
+
1. **Sounds:** Sound Shape Association<br>
|
45 |
+
2. **Sounds:** Sound Gender Association<br>
|
46 |
+
3. **Word:** Word Length and Predictivity<br>
|
47 |
+
4. **Word:** Word Meaning Priming<br>
|
48 |
+
5. **Syntax:** Structural Priming<br>
|
49 |
+
6. **Syntax:** Syntactic Ambiguity Resolution<br>
|
50 |
+
7. **Meaning:** Implausible Sentence Interpretation<br>
|
51 |
+
8. **Meaning:** Semantic Illusion<br>
|
52 |
+
9. **Discourse:** Implicit Causality<br>
|
53 |
+
10. **Discourse:** Drawing Inferences<br><br>
|
54 |
+
Each task is composed of multiple stimuli, designed to elicit both expected and unexpected responses. We have gathered data from 2000 human participants, generating response distributions that reflect natural human behavior across these tasks. By presenting identical stimuli to advanced language models, we generate corresponding response distributions for comparison.<br><br>
|
55 |
+
The degree of congruence between these human and model distributions offers a precise measure of the model's humanlikeness.<br>
|
56 |
"""
|
57 |
|
58 |
# Which evaluations are you running? how can people reproduce what you have?
|