inspect_web_clusters

Running

App Files Community

loubnabnl HF staff committed on Jan 23, 2024

Commit

64b11d2

verified ·

1 Parent(s): 94366ef

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -25

app.py CHANGED Viewed

@@ -5,45 +5,34 @@ import os
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 st.set_page_config(page_title="FW Clusters inspection", layout="wide")
-st.title("FW clusters inspection (on AFAIK topics)")
 st.markdown("""
 We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).
-Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material.
-Additionally, the model was tasked with assigning a category to each cluster from 23 predefined categories found in [AFAIK](https://afaik.io/).
-Sometimes, the model may define its own category. This can happen either within the context of AFAIK topics or seperately. Hence the `Select Category Type` dropdown in our interface.
 """)
 @st.cache_data
-def load_data(educational_topic):
-    ds = load_dataset("HuggingFaceTB/FW_clusters_under_afaik_topics", split="train", token=HF_TOKEN, num_proc=2)
-    if educational_topic in ['Yes', 'No']:
-        ds = ds.filter(lambda x: x['is_topic_educational'] == educational_topic)
     return ds
-@st.cache_data
-def get_categories_by_type(_ds, category_type):
-    filtered_ds = _ds.filter(lambda x: x['category_type'] == category_type)
-    return list(set(filtered_ds['category']))
 st.subheader("Cluster information")
-col_1, col_2, col_3 = st.columns(3)
-with col_1:
-    educational_topic = st.selectbox('Are the topics deemed educational by the LLM?', ["Yes", "No"])
-ds = load_data(educational_topic)
-with col_2:
-    category_types = ['afaik', 'defined_by_llm', 'defined_by_llm_under_afaik']
-    default_index = 0 if educational_topic == "Yes" else 1
-    selected_category_type = st.selectbox("Select Category Type", category_types, index=default_index)
-with col_3:
-    categories = get_categories_by_type(ds, selected_category_type)
-    selected_category = st.selectbox("Select Category", categories)
 selected_cluster = ds.filter(lambda x: x['category'] == selected_category)

 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 st.set_page_config(page_title="FW Clusters inspection", layout="wide")
+st.title("FW clusters inspection (free topics)")
 st.markdown("""
 We clustered 100k FineWeb samples using [text-clustering](https://github.com/huggingface/text-clustering).
+Our approach involved prompting Mixtral to evaluate whether the topics in each cluster are educational or could be considered college material using a score from 1 to 10.
+Additionally, the model was tasked with finding the topic of each cluster.
 """)
 @st.cache_data
+def load_data(educational_topic, min_score=1, max_score=10):
+    ds = load_dataset("HuggingFaceTB/FW_clusters_free_topics", split="train", token=HF_TOKEN, num_proc=2)
+    ds = ds.filter(lambda x: x['educational_score'] <= max_score and x['educational_score'] >= min_score)
     return ds
 st.subheader("Cluster information")
+min_score, max_score = st.columns(2)
+with min_score:
+    min_value = st.slider('Select minimum educational score', 1, 10, 1)
+with max_score:
+    max_value = st.slider('Select maximum educational score', 1, 10, 10)
+ds = load_data(educational_topic, min_score, max_score)
+categories = list(set(ds["category"]))
+selected_category_type = st.selectbox("Select a topic", categories)
 selected_cluster = ds.filter(lambda x: x['category'] == selected_category)