peter2000's picture
Update apps/intro.py
90dfdac
raw
history blame
3.53 kB
import plotly.express as px
import streamlit as st
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_url, cached_download
import umap.umap_ as umap
import pandas as pd
import os
import joblib
def init_models():
    """Load the sentence encoder and the serialized UMAP model.

    Returns:
        A ``(model, umap_model)`` tuple: the ``SentenceTransformer`` created
        for 'sentence-transformers/all-MiniLM-L6-v2', and the object that
        ``joblib.load`` restores from the ``.sav`` file fetched from the
        Hugging Face Hub repository named below.
    """
    encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # Resolve the Hub URL of the saved model file, then download it
    # (cached_download returns the path of the local copy).
    reducer_url = hf_hub_url(
        "peter2000/umap_embed_3d_all-MiniLM-L6-v2",
        "umap_embed_3d_all-MiniLM-L6-v2.sav",
    )
    reducer_path = cached_download(reducer_url)

    # NOTE(review): joblib.load unpickles the downloaded file, and unpickling
    # can execute arbitrary code -- only point this at a repository you trust.
    reducer = joblib.load(reducer_path)
    return encoder, reducer
def _add_context(text):
    """Return *text*, prefixed with "The book is about " if it is shorter than 15 characters."""
    if len(text) < 15:
        return "The book is about " + text
    return text


def app():
    """Render the "Text Embedder" page.

    Shows a title and an "About" expander, then a text input and a category
    selector. When the "Embed" button is pressed, the entered text and its
    category are appended to the lists kept in ``st.session_state`` under
    ``'embed_list'`` and ``'cat_list'``, every stored text is encoded with the
    sentence-transformer model, projected with the UMAP model and drawn as a
    3D scatter plot.

    Raises:
        KeyError: if ``st.session_state`` has no ``'embed_list'`` or
            ``'cat_list'`` entry; this function reads them but never creates
            them.
    """
    with st.container():
        st.markdown(
            "<h1 style='text-align: center; color: black;'> Text Embedder</h1>",
            unsafe_allow_html=True)
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=True):
        st.write(
            """
Information cartography - Get your word/phrase/sentence/paragraph embedded and visualized.
The (English) sentence-transformers model "all-MiniLM-L6-v2" maps sentences & paragraphs to a 384 dimensional dense vector space This is normally used for tasks like clustering or semantic search, but in this case, we use it to place your text to a 3D map. Before plotting, the dimension needs to be reduced to three so we can actually plot it, but preserve as much information as possible. For this, we use a technology called umap. The sentence transformer is context sensitive and works best with whole sentences, to account for that we extend your text with "The book is about <text>" if its less than 15 characters.
Simply put in your text and press EMBED, your examples will add up. You can use the category for different coloring.
""")
        st.markdown("")

    # Both lists live in the session state so that examples accumulate
    # across reruns of the script.
    word_to_embed_list = st.session_state['embed_list']
    cat_list = st.session_state['cat_list']

    with st.container():
        col1, col2 = st.columns(2)
        with col1:
            word_to_embed = st.text_input(
                "Please enter your text here and we will embed it for you.",
                value="",
            )
        with col2:
            cat = st.selectbox('Category', ('1', '2', '3', '4', '5'))

        if st.button("Embed"):
            with st.spinner("👑 Embedding your input"):
                model, umap_model = init_models()

                word_to_embed_list.append(word_to_embed)
                st.session_state['embed_list'] = word_to_embed_list
                cat_list.append(cat)
                # Store under the same key the list is read from (the key
                # previously carried a trailing space).
                st.session_state['cat_list'] = cat_list

                # Short texts get a carrier sentence; longer texts are embedded
                # unchanged. Every stored text must yield exactly one phrase so
                # the embeddings stay aligned with the category and hover lists.
                phrase_to_embed = [_add_context(wte) for wte in word_to_embed_list]
                examples_embeddings = model.encode(phrase_to_embed)
                examples_umap = umap_model.transform(examples_embeddings)

            with st.spinner("👑 create visualisation"):
                # The first stored entry is skipped in all three sequences --
                # presumably a placeholder seeded where the session state is
                # initialised; TODO confirm against the caller.
                fig = px.scatter_3d(
                    examples_umap[1:], x=0, y=1, z=2,
                    color=cat_list[1:],
                    opacity=.7, hover_data=[word_to_embed_list[1:]])
                fig.update_scenes(xaxis_visible=False, yaxis_visible=False, zaxis_visible=False)
                fig.update_traces(marker_size=4)
                st.plotly_chart(fig)