inspect_web_clusters

Running

File size: 2,004 Bytes

8aac646
 
 
 
 
 
b19b634
64b11d2
b19b634
 
 
 
64b11d2
b19b634
64b11d2
b19b634
 
 
2865184
64b11d2
 
8aac646
 
b19b634
2865184
 
dab4dfa
2865184
dab4dfa
b19b634
2865184
18967bf
64b11d2
b19b634
8aac646
b19b634
c6c5724
 
 
 
 
8aac646
c6c5724
 
 
 
 
 
 
 
2b533cd

import streamlit as st
from datasets import load_dataset
import os 

# Access token for the Hugging Face Hub, read from the environment.
# Stays None when the variable is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# Page chrome: wide layout plus the title shown at the top of the app.
st.set_page_config(page_title="FW Clusters inspection", layout="wide")
st.title("FW clusters inspection (free topics)")

# Short introduction rendered as markdown under the title.
st.markdown("""
We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering). 

Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10. 

Additionally, the model was tasked with finding the topic of each cluster. 
""")

@st.cache_data
def load_data(min: int = 1, max: int = 10):
    """Load the cluster dataset, keeping rows whose educational score is in range.

    Args:
        min: Lowest ``educational_score`` to keep (inclusive).
        max: Highest ``educational_score`` to keep (inclusive).

    Returns:
        The ``train`` split of ``HuggingFaceTB/FW_clusters_free_topics``
        restricted to rows with ``min <= educational_score <= max``.
    """
    # The parameter names shadow the builtins; they are kept so existing
    # callers keep working, and aliased here so the filter reads clearly.
    # (The original lambda referenced undefined `min_score`/`max_score`.)
    min_score, max_score = min, max
    ds = load_dataset(
        "HuggingFaceTB/FW_clusters_free_topics",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )
    return ds.filter(lambda x: min_score <= x["educational_score"] <= max_score)

st.subheader("Cluster information")

# Two side-by-side sliders bound the educational score range to inspect.
col_1, col_2 = st.columns(2)
with col_1:
    min_value = st.slider('Select minimum educational score', 1, 10, 1, key='min_score')
with col_2:
    max_value = st.slider('Select maximum educational score', 1, 10, 10, key='max_score')

ds = load_data(min_value, max_value)

# The option list must exist before the selectbox is built. Sorting keeps the
# dropdown order stable; key=str avoids a TypeError if a category is None.
categories = sorted(set(ds["category"]), key=str)
selected_category = st.selectbox("Select a topic", categories)
selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

# Select sample index
n_samples = len(selected_cluster)
if n_samples > 0:
    col_1, col_2 = st.columns(2)
    with col_1:
        index_cluster = st.number_input(f"Found {n_samples} clusters, choose one",  min_value=0, max_value=n_samples - 1, value=0, step=1)

    files = selected_cluster[index_cluster]["examples"]

    if files:
        with col_2:
            index_example = st.number_input(f"Found {len(files)} files in the cluster, choose one",  min_value=0, max_value=len(files)-1, value=0, step=1)

        sample = files[index_example]
        st.markdown(sample)
    else:
        # An empty cluster would give number_input max_value=-1 < min_value=0,
        # so show the fallback message instead of building the widget.
        st.markdown("No files found, change the cluster.")
else:
    st.markdown("No files found, change the cluster.")