sunitha98 committed
Commit 6f72df1
1 Parent(s): 943f952

update display

Files changed (1):
  1. src/display/about.py +29 -5
src/display/about.py CHANGED
@@ -11,24 +11,48 @@ class Task:
 # Init: to update with your specific keys
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("task_name1", "metric_name", "First task")
-    task1 = Task("task_name2", "metric_name", "Second task")
+    task0 = Task("finance_bench", "accuracy", "FinanceBench")
+    task1 = Task("legal_confidentiality", "accuracy", "Legal Confidentiality")
+    task2 = Task("writing-prompts", "coherence", "Writing Prompts")
+    task3 = Task("customer-support", "engagement", "Customer Support Dialogue")
+    task4 = Task("toxic-prompts", "toxicity", "Toxic Prompts")
+    task5 = Task("enterprise-pii", "accuracy", "Enterprise PII")
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">Patronus AI leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+This leaderboard evaluates the performance of models on real-world enterprise use cases.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 
+## Tasks
+1. FinanceBench: The task measures the ability to answer financial questions given the context.
+
+2. Legal Confidentiality: The task measures the ability of LLMs to reason over legal clauses. The model is prompted
+to return yes/no as an answer to the question.
+
+3. Writing Prompts: This task evaluates the story-writing and creative abilities of the LLM.
+
+4. Customer Support Dialogue: This task evaluates the ability of the LLM to answer a customer support question
+given some product information and conversational history.
+
+5. Toxic Prompts: This task evaluates the safety of the model by using prompts that can elicit harmful information
+from LLMs.
+
+6. Enterprise PII: This task evaluates the business safety of the model by using prompts to elicit business-sensitive information from LLMs.
+
 ## Reproducibility
-To reproduce our results, here is the commands you can run:
+All of our datasets are closed-source. We provide a validation set with 5 examples for each of the tasks.
+
+To reproduce the results on the validation set, run:
+
+
 
 """