# Streamlit app for browsing FineWeb clusters: filter clusters by educational
# score, pick a topic, then page through the clusters and their example files.
import os

import streamlit as st
from datasets import load_dataset

# Optional Hugging Face access token, read from the environment.
HF_TOKEN = os.environ.get("HF_TOKEN", None)

st.set_page_config(page_title="FW Clusters inspection", layout="wide")
st.title("FW clusters inspection (free topics)")

st.markdown("""
We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).
Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10.
Additionally, the model was tasked with finding the topic of each cluster.
""")


@st.cache_data
def load_data(min_score=1, max_score=10, show_special=False):
    """Load the cluster dataset and keep rows matching the score filter.

    Args:
        min_score: Lowest educational score (inclusive) to keep.
        max_score: Highest educational score (inclusive) to keep.
        show_special: When True, also keep rows whose ``educational_score``
            cannot be converted to an integer (e.g. ``None`` or ``''``).

    Returns:
        The filtered ``train`` split of the dataset.
    """
    ds = load_dataset(
        "HuggingFaceTB/FW_clusters_free_topics",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )

    def filter_func(x):
        try:
            score = int(x["educational_score"])
        except (ValueError, TypeError):
            # Undefined score: keep the row only if the user asked for them.
            return show_special
        # The checkbox must not override the slider range for scored rows.
        return min_score <= score <= max_score

    return ds.filter(filter_func)


st.subheader("Cluster information")

# Three widgets, so three columns (unpacking three names from two columns
# raises ValueError).
col_1, col_2, col_3 = st.columns(3)
with col_1:
    min_value = st.slider(
        'Select minimum educational score', 1, 10, 1, key='min_score'
    )
with col_2:
    max_value = st.slider(
        'Select maximum educational score', 1, 10, 10, key='max_score'
    )
with col_3:
    show_special = st.checkbox(
        'Show clusters with undefined educational score', False
    )

# Load data based on slider values and checkbox status
ds = load_data(min_value, max_value, show_special)

# Build the topic list before the selectbox that consumes it. Sorting (by
# string form, so a None category cannot break the comparison) keeps the
# dropdown order stable.
categories = sorted(set(ds["category"]), key=str)
selected_category = st.selectbox("Select a topic", categories)
selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

# Select sample index
n_samples = len(selected_cluster)
if n_samples > 0:
    col_1, col_2 = st.columns(2)
    with col_1:
        index_cluster = st.number_input(
            f"Found {n_samples} clusters, choose one",
            min_value=0,
            max_value=n_samples - 1,
            value=0,
            step=1,
        )

    files = selected_cluster[index_cluster]["examples"]
    if files:
        with col_2:
            index_example = st.number_input(
                f"Found {len(files)} files in the cluster, choose one",
                min_value=0,
                max_value=len(files) - 1,
                value=0,
                step=1,
            )

        sample = files[index_example]
        st.markdown(sample)
    else:
        # An empty cluster would give number_input max_value < min_value.
        st.markdown("No files found, change the cluster.")
else:
    st.markdown("No files found, change the cluster.")