|
import os

import streamlit as st
from datasets import load_dataset

# Hugging Face access token read from the environment; None when the
# variable is unset, in which case load_dataset() is called without a token.
HF_TOKEN = os.environ.get("HF_TOKEN")
|
|
|
# Page-level configuration must be issued before any other Streamlit element.
st.set_page_config(page_title="FW Clusters inspection", layout="wide")
st.title("FW clusters inspection (free topics)")

# Short introduction shown at the top of the page.
st.markdown("""
We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).

Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10.

Additionally, the model was tasked with finding the topic of each cluster.
""")
|
|
|
@st.cache_data
def load_data(min=1, max=10):
    """Load the clusters dataset restricted to an educational-score range.

    The result is memoised by ``st.cache_data``, keyed on the arguments.

    Args:
        min: Lowest ``educational_score`` to keep (inclusive).
        max: Highest ``educational_score`` to keep (inclusive).

    Returns:
        The ``train`` split of ``HuggingFaceTB/FW_clusters_free_topics``
        filtered to rows whose ``educational_score`` lies in ``[min, max]``.
    """
    # The parameter names shadow the builtins but are part of the public
    # signature; bind them to descriptive locals so the filter below reads
    # clearly and never refers to an undefined name.
    min_score, max_score = min, max

    ds = load_dataset(
        "HuggingFaceTB/FW_clusters_free_topics",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )
    return ds.filter(
        lambda x: min_score <= x["educational_score"] <= max_score
    )
|
|
|
st.subheader("Cluster information")

# Two side-by-side sliders select the inclusive educational-score range.
col_1, col_2 = st.columns(2)
with col_1:
    min_value = st.slider('Select minimum educational score', 1, 10, 1, key='min_score')
with col_2:
    max_value = st.slider('Select maximum educational score', 1, 10, 10, key='max_score')

ds = load_data(min_value, max_value)

# The option list has to exist before the selectbox is built. Sorting gives
# a stable option order across reruns and restarts; key=str keeps the sort
# well-defined even if some category value is not a string.
categories = sorted(set(ds["category"]), key=str)
selected_category = st.selectbox("Select a topic", categories)
selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

n_samples = len(selected_cluster)
if n_samples > 0:
    col_1, col_2 = st.columns(2)
    with col_1:
        index_cluster = st.number_input(
            f"Found {n_samples} clusters, choose one",
            min_value=0,
            max_value=n_samples - 1,
            value=0,
            step=1,
        )

    files = selected_cluster[index_cluster]["examples"]

    if len(files) > 0:
        with col_2:
            index_example = st.number_input(
                f"Found {len(files)} files in the cluster, choose one",
                min_value=0,
                max_value=len(files) - 1,
                value=0,
                step=1,
            )

        sample = files[index_example]
        st.markdown(sample)
    else:
        # An empty cluster would otherwise request max_value=-1 below
        # min_value=0 from number_input.
        st.markdown("No files found in this cluster, choose another one.")
else:
    st.markdown("No files found, change the cluster.")