File size: 2,790 Bytes
8aac646 bf9b80e b19b634 bf9b80e b19b634 bc78067 b19b634 bc78067 b19b634 85a8c20 b19b634 85a8c20 770946a 85a8c20 6881bc0 85a8c20 8aac646 b19b634 f5985dd 2865184 01206ed 2865184 01206ed 85a8c20 01206ed 85a8c20 64b11d2 01206ed b19b634 8aac646 b19b634 c6c5724 8aac646 c6c5724 2858b59 c6c5724 2b533cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
import streamlit as st
from datasets import load_dataset
import os
# Hugging Face access token read from the environment; None when the
# HF_TOKEN variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Page-level configuration is issued before any other Streamlit call.
st.set_page_config(layout="wide", page_title="Web Clusters inspection")
st.title("Web clusters inspection")

# Introductory blurb rendered under the page title. The trailing backslash
# inside the literal joins that line with the next one in the resulting string.
_INTRO_MARKDOWN = """
We clustered 100k web samples using [text-clustering](https://github.com/huggingface/text-clustering).
Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10. \
Technically, we provide it with 10 random examples from the cluster in the prompt and ask it to judge their topics.
Additionally, the model was tasked with finding the topic of each cluster (based on the 10 random examples).
"""
st.markdown(_INTRO_MARKDOWN)
@st.cache_data
def load_data(min_score=1, max_score=10, show_special=False):
    """Load the cluster dataset and keep only the rows matching the filters.

    Args:
        min_score: Lowest educational score (inclusive) to keep.
        max_score: Highest educational score (inclusive) to keep.
        show_special: When true, keep ONLY rows whose ``educational_score``
            cannot be converted to an int; the score range is then ignored.

    Returns:
        The filtered ``train`` split of
        ``HuggingFaceTB/FW_clusters_100k_145_topics``.
    """
    # NOTE(review): the original carried a commented-out dataset id,
    # "HuggingFaceTB/FW_clusters_free_topics" - presumably an alternative
    # source; confirm before removing.
    clusters = load_dataset(
        "HuggingFaceTB/FW_clusters_100k_145_topics",
        split="train",
        token=HF_TOKEN,
        num_proc=2,
    )

    def keep_row(row):
        """Return True when *row* should stay in the filtered dataset."""
        try:
            score = int(row["educational_score"])
            if show_special:
                # A parseable score is not "special", so drop the row.
                return False
            return min_score <= score <= max_score
        except (ValueError, TypeError):
            # Score missing or not an integer: keep the row only when the
            # caller asked for the undefined-score clusters.
            return show_special

    return clusters.filter(keep_row)
# --- Filter controls -------------------------------------------------------
st.subheader("Cluster information")
col_1, col_2, col_3 = st.columns(3)
with col_1:
    show_special = st.checkbox('Show only clusters with undefined educational score', False)
with col_2:
    min_value = st.slider('Select minimum educational score', 1, 10, 1, key='min_score')
with col_3:
    max_value = st.slider('Select maximum educational score', 1, 10, 10, key='max_score')

# Load data based on slider values and checkbox status.
ds = load_data(min_value, max_value, show_special)

# --- Topic selection -------------------------------------------------------
# Sort the distinct topics so the dropdown order (and therefore the default
# selection) is stable; plain set iteration order for strings can differ
# between interpreter runs. key=str keeps the sort safe if a category is None.
categories = sorted(set(ds["category"]), key=str)
selected_category = st.selectbox("Select a topic", categories)
selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

# --- Cluster / example selection --------------------------------------------
n_samples = len(selected_cluster)
if n_samples > 0:
    col_1, col_2 = st.columns(2)
    with col_1:
        index_cluster = st.number_input(
            f"Found {n_samples} clusters, choose one",
            min_value=0,
            max_value=n_samples - 1,
            value=0,
            step=1,
        )

    # Fetch the chosen row once instead of indexing the dataset twice.
    cluster = selected_cluster[index_cluster]
    files = cluster["examples"]

    st.markdown(f"**Educational score of the cluster**: {cluster['educational_score']}")
    if files:
        with col_2:
            index_example = st.number_input(
                f"Found {len(files)} files in the cluster, choose one",
                min_value=0,
                max_value=len(files) - 1,
                value=0,
                step=1,
            )
        st.markdown(files[index_example])
    else:
        # An empty example list would otherwise ask number_input for a
        # max_value of -1 (below min_value), which Streamlit rejects.
        st.markdown("No files found in this cluster, choose another one.")
else:
    st.markdown("No files found, change the cluster.")