Alcime committed on
Commit
3f7c271
·
1 Parent(s): 1b0a5d8

Edit app.py to add textual content

Browse files
Files changed (1) hide show
  1. app.py +10 -3
app.py CHANGED
@@ -10,19 +10,26 @@ st.sidebar.write(
10
  )
11
  st.sidebar.title("Dataset")
12
  st.sidebar.write(
13
- "We used a subset of Wikipedia dataset: https://huggingface.co/datasets/OpenAssistant/oasst2"
14
  )
15
 
16
  st.title("How to understand large textual datasets?")
17
-
 
 
18
  df = pd.read_csv("data/data_sample.csv", index_col=[0])
19
  df = df[["message_id", "text"]]
20
  df = df.head(300)
21
  st.dataframe(df, use_container_width=True)
22
  st.title("Inside the OASST2 dataset")
23
- element = open("images/map_prompt.html", "r", encoding="utf-8")
 
24
 
25
  components.html(element.read(), height=900, width=900)
 
 
 
 
26
 
27
  st.title("Some insights by territory")
28
  df_info = pd.read_csv("data/topics_info.csv", index_col=[0])
 
10
  )
11
  st.sidebar.title("Dataset")
12
  st.sidebar.write(
13
+ "We used a subset of the Open Assistant 2 dataset: https://huggingface.co/datasets/OpenAssistant/oasst2"
14
  )
15
 
16
  st.title("How to understand large textual datasets?")
17
+ st.info(
18
+ "We sampled every prompt from the English subset of the oasst2 dataset. Here is a sample:"
19
+ )
20
  df = pd.read_csv("data/data_sample.csv", index_col=[0])
21
  df = df[["message_id", "text"]]
22
  df = df.head(300)
23
  st.dataframe(df, use_container_width=True)
24
  st.title("Inside the OASST2 dataset")
25
+ element = open("images/map_prompt.html", "r", encoding="utf-8",
26
+ caption="This mapping can be extended to include the assistant answers, and the prompts can be selected on a topic basis through the python package, allowing to filter and curate the data.")
27
 
28
  components.html(element.read(), height=900, width=900)
29
+ st.info(
30
+ "The different clusters allow to explore the main topics evoked by the prompt. For instance, in the blink of an eye, one may see which topics are dealt with and which topics are lacking content. This visualisation can therefore be used as a stepping stone to investigate bias or content, providing resources to fuel the discussion open by the OASS paper available here : https://drive.google.com/file/d/10iR5hKwFqAKhL3umx8muOWSRm7hs5FqX/view"
31
+ )
32
+
33
 
34
  st.title("Some insights by territory")
35
  df_info = pd.read_csv("data/topics_info.csv", index_col=[0])