"""Streamlit page that embeds user-supplied text and shows it on a 3D map."""

import os

import joblib
import pandas as pd
import plotly.express as px
import streamlit as st
import umap.umap_ as umap
from huggingface_hub import hf_hub_url, cached_download
from sentence_transformers import SentenceTransformer

# Short inputs get this prefix so the sentence encoder sees a full sentence.
_CONTEXT_PREFIX = "The book is about "
# Texts shorter than this many characters receive the context prefix.
_SHORT_TEXT_LIMIT = 15

# Runtime text of the "About this app" expander (kept character-for-character).
_ABOUT_TEXT = (
    ' Information cartography - Get your word/phrase/sentence/paragraph'
    ' embedded and visualized. The (English) sentence-transformers model'
    ' "all-MiniLM-L6-v2" maps sentences & paragraphs to a 384 dimensional'
    ' dense vector space This is normally used for tasks like clustering or'
    ' semantic search, but in this case, we use it to place your text to a'
    ' 3D map. Before plotting, the dimension needs to be reduced to three so'
    ' we can actually plot it, but preserve as much information as possible.'
    ' For this, we use a technology called umap. The sentence transformer is'
    ' context sensitive and works best with whole sentences, to account for'
    ' that we extend your text with "The book is about " if its less than 15'
    ' characters. Simply put in your text and press EMBED, your examples'
    ' will add up. You can use the category for different coloring. '
)


def init_models():
    """Load the sentence encoder and the pre-fitted UMAP reducer.

    Returns:
        A ``(model, umap_model)`` tuple: the ``SentenceTransformer`` for
        ``sentence-transformers/all-MiniLM-L6-v2`` and the object unpickled
        from the ``peter2000/umap_embed_3d_all-MiniLM-L6-v2`` hub repository.
    """
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    model = SentenceTransformer(model_name)

    REPO_ID = "peter2000/umap_embed_3d_all-MiniLM-L6-v2"
    FILENAME = "umap_embed_3d_all-MiniLM-L6-v2.sav"
    # SECURITY: joblib.load unpickles the downloaded file, which can execute
    # arbitrary code; only keep this if the hub repository is trusted.
    # NOTE(review): cached_download is deprecated in newer huggingface_hub
    # releases (hf_hub_download is the replacement) - confirm the pinned
    # version before upgrading.
    umap_model = joblib.load(cached_download(hf_hub_url(REPO_ID, FILENAME)))
    return model, umap_model


def _with_context(text):
    """Return *text*, prefixed with the context sentence if it is short."""
    if len(text) < _SHORT_TEXT_LIMIT:
        return _CONTEXT_PREFIX + text
    return text


def app():
    """Render the "Text Embedder" page.

    Reads the running lists ``embed_list`` and ``cat_list`` from
    ``st.session_state`` (raises ``KeyError`` if they were never set), and on
    every click of "Embed" appends the current text and category, re-embeds
    all collected texts, reduces them with the UMAP model and draws a 3D
    scatter plot coloured by category.
    """
    with st.container():
        # NOTE(review): unsafe_allow_html is set but the string carries no
        # HTML; heading markup may have been lost - confirm.
        st.markdown("\n\nText Embedder\n\n", unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=True):
        st.write(_ABOUT_TEXT)
        st.markdown("")

    # The first entry of each list is skipped when plotting ([1:] below);
    # presumably the caller seeds both lists with a placeholder - verify.
    word_to_embed_list = st.session_state['embed_list']
    cat_list = st.session_state['cat_list']

    with st.container():
        col1, col2 = st.columns(2)
        with col1:
            word_to_embed = st.text_input(
                "Please enter your text here and we will embed it for you.",
                value="",
            )
        with col2:
            cat = st.selectbox('Category', ('1', '2', '3', '4', '5'))

        if st.button("Embed"):
            with st.spinner("👑 Embedding your input"):
                # The models are loaded anew on every click.
                model, umap_model = init_models()

                word_to_embed_list.append(word_to_embed)
                st.session_state['embed_list'] = word_to_embed_list
                cat_list.append(cat)
                st.session_state['cat_list'] = cat_list

                # Keep exactly one phrase per collected text so the embedding
                # rows stay aligned with cat_list and word_to_embed_list;
                # only short texts are extended with the context prefix.
                phrase_to_embed = [
                    _with_context(text) for text in word_to_embed_list
                ]
                examples_embeddings = model.encode(phrase_to_embed)
                examples_umap = umap_model.transform(examples_embeddings)

            with st.spinner("👑 create visualisation"):
                fig = px.scatter_3d(
                    examples_umap[1:],
                    x=0,
                    y=1,
                    z=2,
                    color=cat_list[1:],
                    opacity=.7,
                    hover_data=[word_to_embed_list[1:]],
                )
                fig.update_scenes(
                    xaxis_visible=False,
                    yaxis_visible=False,
                    zaxis_visible=False,
                )
                fig.update_traces(marker_size=4)
                st.plotly_chart(fig)