inspect_web_clusters

Running

App Files Files Community

inspect_web_clusters / app.py

loubnabnl HF staff

Update app.py

bc78067 verified about 1 year ago

raw

history blame

2.76 kB

	import streamlit as st
	from datasets import load_dataset
	import os

	# Optional Hugging Face token read from the environment; None when the variable is unset.
	HF_TOKEN = os.environ.get("HF_TOKEN")

	# Page setup: wide layout, page title, and the on-page heading.
	st.set_page_config(layout="wide", page_title="FW Clusters inspection")
	st.title("FW clusters inspection (free topics)")

	# Introductory text explaining how the clusters, scores and topics were produced.
	INTRO_TEXT = """
	We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).

	Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10. \
	Technically, we provide it with 10 random examples from the cluster in the prompt and ask it to judge their topics.

	Additionally, the model was tasked with finding the topic of each cluster (based on the 10 random examples).
	"""
	st.markdown(INTRO_TEXT)


	@st.cache_data
	def load_data(min_score=1, max_score=10, show_special=False):
	    """Load the cluster dataset and keep only the rows matching the score filter.

	    The "train" split of ``HuggingFaceTB/FW_clusters_free_topics`` is loaded
	    (authenticated with ``HF_TOKEN``) and then filtered row by row on the
	    ``educational_score`` field.

	    Args:
	        min_score: Lowest score (inclusive) to keep when ``show_special`` is off.
	        max_score: Highest score (inclusive) to keep when ``show_special`` is off.
	        show_special: When true, keep only the rows whose ``educational_score``
	            cannot be converted to ``int``; rows with a usable score are dropped.

	    Returns:
	        The filtered dataset.
	    """

	    def keep_row(row):
	        # Decide whether a single row survives the filter.
	        try:
	            parsed = int(row['educational_score'])
	            if show_special:
	                # A usable score means this is not a "special" cluster.
	                return False
	            return min_score <= parsed <= max_score
	        except (ValueError, TypeError):
	            # Score is missing or not numeric: keep only in "special" mode.
	            return show_special

	    dataset = load_dataset("HuggingFaceTB/FW_clusters_free_topics", split="train", token=HF_TOKEN, num_proc=2)
	    return dataset.filter(keep_row)

	# Filter controls: an "undefined score" toggle plus minimum/maximum score sliders.
	st.subheader("Cluster information")
	col_1, col_2, col_3 = st.columns(3)
	with col_1:
	    show_special = st.checkbox('Show only clusters with undefined educational score', False)
	with col_2:
	    min_value = st.slider('Select minimum educational score', 1, 10, 1, key='min_score')
	with col_3:
	    max_value = st.slider('Select maximum educational score', 1, 10, 10, key='max_score')

	# Load data based on slider values and checkbox status
	ds = load_data(min_value, max_value, show_special)
	# Sort the distinct topics so the dropdown order (and therefore its default
	# entry) does not depend on set iteration order. key=str keeps the sort from
	# failing if a category value is not a string (e.g. None).
	categories = sorted(set(ds["category"]), key=str)
	selected_category = st.selectbox("Select a topic", categories)
	selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

	# Select sample index
	n_samples = len(selected_cluster)
	if n_samples > 0:
	    col_1, col_2 = st.columns(2)
	    with col_1:
	        index_cluster = st.number_input(f"Found {n_samples} clusters, choose one", min_value=0, max_value=n_samples - 1, value=0, step=1)

	    # Fetch the chosen row once; both the examples and the score come from it.
	    cluster = selected_cluster[index_cluster]
	    files = cluster["examples"]
	    n_files = len(files)

	    if n_files > 0:
	        with col_2:
	            index_example = st.number_input(f"Found {n_files} files in the cluster, choose one", min_value=0, max_value=n_files - 1, value=0, step=1)
	        sample = files[index_example]
	    else:
	        # With no examples the picker would get max_value=-1 (below min_value=0)
	        # and files[0] would not exist, so show a message instead.
	        sample = "No files found in this cluster, choose another one."

	    st.markdown(f"Educational score of the cluster: {cluster['educational_score']}")
	    st.markdown(sample)
	else:
	    st.markdown("No files found, change the cluster.")