File size: 2,004 Bytes
8aac646
 
 
 
 
 
b19b634
64b11d2
b19b634
 
 
 
64b11d2
b19b634
64b11d2
b19b634
 
 
2865184
64b11d2
 
8aac646
 
b19b634
2865184
 
dab4dfa
2865184
dab4dfa
b19b634
2865184
18967bf
64b11d2
b19b634
8aac646
b19b634
c6c5724
 
 
 
 
8aac646
c6c5724
 
 
 
 
 
 
 
2b533cd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import streamlit as st
from datasets import load_dataset
import os 

# Optional Hugging Face access token read from the environment (None when unset).
HF_TOKEN = os.getenv("HF_TOKEN")

# Page chrome: wide layout and the app title.
st.set_page_config(layout="wide", page_title="FW Clusters inspection")
st.title("FW clusters inspection (free topics)")

# Introductory text describing how the clusters and their scores were produced.
st.markdown(
    """
We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering). 

Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10. 

Additionally, the model was tasked with finding the topic of each cluster. 
"""
)

@st.cache_data
def load_data(min=1, max=10):
    """Load the cluster dataset and keep rows within an educational-score range.

    Args:
        min: Lowest ``educational_score`` to keep (inclusive).
        max: Highest ``educational_score`` to keep (inclusive).

    Returns:
        The ``train`` split of ``HuggingFaceTB/FW_clusters_free_topics``
        restricted to rows whose ``educational_score`` lies in ``[min, max]``.
    """
    # The parameter names shadow the builtins but are kept so existing callers
    # (positional or keyword) keep working; bind them to clear local names.
    # The original body referenced undefined `min_score`/`max_score` directly.
    min_score, max_score = min, max
    ds = load_dataset(
        "HuggingFaceTB/FW_clusters_free_topics",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )
    return ds.filter(lambda x: min_score <= x['educational_score'] <= max_score)

# --- Cluster browsing UI -----------------------------------------------------
st.subheader("Cluster information")

# Two side-by-side sliders bound the educational score range.
col_1, col_2 = st.columns(2)
with col_1:
    min_value = st.slider('Select minimum educational score', 1, 10, 1, key='min_score')
with col_2:
    max_value = st.slider('Select maximum educational score', 1, 10, 10, key='max_score')

ds = load_data(min_value, max_value)

# The option list must exist before the selectbox is built. Sorting (by string
# form, so mixed/None values cannot break the comparison) keeps the option
# order stable instead of depending on set iteration order.
categories = sorted(set(ds["category"]), key=str)
selected_category = st.selectbox("Select a topic", categories)
selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

# Select sample index
n_samples = len(selected_cluster)
if n_samples > 0:
    col_1, col_2 = st.columns(2)
    with col_1:
        index_cluster = st.number_input(
            f"Found {n_samples} clusters, choose one",
            min_value=0,
            max_value=n_samples - 1,
            value=0,
            step=1,
        )

    files = selected_cluster[index_cluster]["examples"]

    if files:
        with col_2:
            index_example = st.number_input(
                f"Found {len(files)} files in the cluster, choose one",
                min_value=0,
                max_value=len(files) - 1,
                value=0,
                step=1,
            )

        sample = files[index_example]
        st.markdown(sample)
    else:
        # An empty example list would give number_input max_value=-1 < min_value.
        st.markdown("No files found in this cluster, choose another one.")
else:
    st.markdown("No files found, change the cluster.")