pavlichenko committed on
Commit fcf7e29
1 Parent(s): a084e60

Update app.py

Files changed (1): app.py (+18 -11)
app.py CHANGED

@@ -2,6 +2,7 @@ import streamlit as st
 import requests
 from collections import defaultdict
 import pandas as pd
+import plotly.graph_objects as go
 
 
 header = """Toloka compared and ranked LLM output in multiple categories, using Guanaco 13B as the baseline.
@@ -25,19 +26,23 @@ The alternative is to use open-source prompts, but they are not reliable enough
 
 To mitigate these issues, we collected organic prompts sent to ChatGPT (some were submitted by Toloka employees, and some we found on the internet, but all of them were from real conversations with ChatGPT). These prompts are the key to accurate evaluation — **we can be certain that the prompts represent real-world use cases, and they were not used in any LLM training sets.** We store the dataset securely and reserve it solely for use in this particular evaluation.
 
-After collecting the prompts, we manually classified them by category and got the following distribution:
+After collecting the prompts, we manually classified them by category and got the following distribution:"""
 
-* Brainstorming: 15.48%
-* Chat: 1.59%
-* Classification: 0.2%
-* Closed QA: 3.77%
-* Extraction: 0.6%
-* Generation: 38.29%
-* Open QA: 32.94%
-* Rewrite: 5.16%
-* Summarization: 1.98%
+# * Brainstorming: 15.48%
+# * Chat: 1.59%
+# * Classification: 0.2%
+# * Closed QA: 3.77%
+# * Extraction: 0.6%
+# * Generation: 38.29%
+# * Open QA: 32.94%
+# * Rewrite: 5.16%
+# * Summarization: 1.98%
 
-We intentionally excluded prompts about coding. If you are interested in comparing coding abilities, you can refer to specific benchmarks such as [HumanEval](https://paperswithcode.com/sota/code-generation-on-humaneval).
+fig = go.Figure(
+    data=[go.Bar(y=[38.29, 32.94, 15.48, 5.16, 3.77, 1.98, 1.59, 0.6, 0.2], x=["Generation", "Open QA", "Brainstorming", "Rewrite", "Closed QA", "Summarization", "Chat", "Extraction", "Classification"])],
+)
+
+description2 = """We intentionally excluded prompts about coding. If you are interested in comparing coding abilities, you can refer to specific benchmarks such as [HumanEval](https://paperswithcode.com/sota/code-generation-on-humaneval).
 
 
 #### 🧠 Stage 2: Human evaluation
@@ -142,6 +147,8 @@ st.dataframe(
     }
 )
 st.markdown(description)
+st.plotly_chart(fig, theme="streamlit")
+st.markdown(description2)
 st.link_button('🚀 Evaluate my model', url='https://toloka.ai/talk-to-us/')
 prompt_examples = """
 ### 🔍 Prompt Examples
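Taken together, the commit closes the markdown string right after "distribution:", builds a Plotly bar chart of the prompt-category shares, and renders it between the two blocks of prose via `st.plotly_chart`. The sketch below is a minimal, self-contained reproduction of that pattern, not the Space's actual app.py: the numbers, category names, and the `theme="streamlit"` argument come from the diff above, while the `categories`/`shares` variable names, the shortened text, and the axis label are illustrative additions.

```python
# Minimal sketch of the pattern this commit introduces: markdown, then a
# Plotly bar chart of the prompt-category distribution, then more markdown.
# Percentages and category names are taken from the diff; variable names,
# the shortened strings, and the axis title are illustrative only.
import plotly.graph_objects as go
import streamlit as st

categories = ["Generation", "Open QA", "Brainstorming", "Rewrite", "Closed QA",
              "Summarization", "Chat", "Extraction", "Classification"]
shares = [38.29, 32.94, 15.48, 5.16, 3.77, 1.98, 1.59, 0.6, 0.2]

description = "After collecting the prompts, we manually classified them by category:"
description2 = "We intentionally excluded prompts about coding."

fig = go.Figure(data=[go.Bar(x=categories, y=shares)])
fig.update_layout(yaxis_title="Share of prompts, %")  # optional axis label

st.markdown(description)                 # prose before the chart
st.plotly_chart(fig, theme="streamlit")  # chart styled to match the Streamlit app
st.markdown(description2)                # prose after the chart
```

Splitting the text into two strings is what lets the chart sit in the middle of the prose rather than after it; rendering order in Streamlit simply follows the order of the `st.*` calls.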