# Spaces: Running
import plotly.express as px | |
import streamlit as st | |
from sentence_transformers import SentenceTransformer | |
from huggingface_hub import hf_hub_url, cached_download | |
import umap.umap_ as umap | |
import pandas as pd | |
import os | |
import joblib | |
def init_models():
    """Load the sentence encoder and the serialized UMAP reducer.

    Returns:
        A ``(model, umap_model)`` tuple: the ``SentenceTransformer`` built
        from 'sentence-transformers/all-MiniLM-L6-v2', and the object that
        ``joblib.load`` restores from the ``.sav`` file hosted in the
        "peter2000/umap_embed_3d_all-MiniLM-L6-v2" Hub repository.
    """
    encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    # Resolve the Hub URL of the serialized reducer, then download it
    # (cached_download presumably reuses a local copy when one exists).
    reducer_url = hf_hub_url(
        "peter2000/umap_embed_3d_all-MiniLM-L6-v2",
        "umap_embed_3d_all-MiniLM-L6-v2.sav",
    )
    reducer_path = cached_download(reducer_url)

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # this is only acceptable as long as the Hub repository is trusted.
    reducer = joblib.load(reducer_path)
    return encoder, reducer
def app():
    """Render the "Text Embedder" page and plot the embedded texts in 3D.

    The page shows a title, an "About" expander, a text input and a category
    selector. When the "Embed" button is pressed, the entered text and its
    category are appended to the lists kept in
    ``st.session_state['embed_list']`` and ``st.session_state['cat_list']``,
    every stored text is encoded with the sentence-transformer, projected
    with the UMAP model's ``transform`` and drawn as a 3D scatter plot
    coloured by category.

    Both session-state keys must already exist when this function runs;
    a missing key raises ``KeyError``.
    """
    with st.container():
        st.markdown(
            "<h1 style='text-align: center; color: black;'> Text Embedder</h1>",
            unsafe_allow_html=True,
        )
        st.write(' ')
        st.write(' ')

    with st.expander("ℹ️ - About this app", expanded=True):
        # The text is kept at column 0 on purpose: indented lines would be
        # rendered by Markdown as a code block.
        st.write(
            """
Information cartography - Get your word/phrase/sentence/paragraph embedded and visualized.
The (English) sentence-transformers model "all-MiniLM-L6-v2" maps sentences & paragraphs to a 384 dimensional dense vector space This is normally used for tasks like clustering or semantic search, but in this case, we use it to place your text to a 3D map. Before plotting, the dimension needs to be reduced to three so we can actually plot it, but preserve as much information as possible. For this, we use a technology called umap. The sentence transformer is context sensitive and works best with whole sentences, to account for that we extend your text with "The book is about <text>" if its less than 15 characters.
Simply put in your text and press EMBED, your examples will add up. You can use the category for different coloring.
""")
        st.markdown("")

    # Texts and categories accumulated over previous "Embed" clicks.
    word_to_embed_list = st.session_state['embed_list']
    cat_list = st.session_state['cat_list']

    with st.container():
        col1, col2 = st.columns(2)
        with col1:
            word_to_embed = st.text_input(
                "Please enter your text here and we will embed it for you.",
                value="",
            )
        with col2:
            cat = st.selectbox('Category', ('1', '2', '3', '4', '5'))

        if st.button("Embed"):
            with st.spinner("👑 Embedding your input"):
                model, umap_model = init_models()

                word_to_embed_list.append(word_to_embed)
                st.session_state['embed_list'] = word_to_embed_list
                cat_list.append(cat)
                # Store under the same key that is read above (the previous
                # key had a trailing space and was never read back).
                st.session_state['cat_list'] = cat_list

                # Keep exactly one phrase per stored text so the embeddings
                # stay aligned with the category and hover lists. Only short
                # texts get the context prefix; longer ones are embedded
                # unchanged instead of being dropped.
                phrase_to_embed = [
                    ("The book is about " + wte) if len(wte) < 15 else wte
                    for wte in word_to_embed_list
                ]
                examples_embeddings = model.encode(phrase_to_embed)
                examples_umap = umap_model.transform(examples_embeddings)

            with st.spinner("👑 create visualisation"):
                # The first entry of every list is skipped when plotting;
                # presumably it is a placeholder seeded where the session
                # state is initialised -- TODO confirm against that code.
                fig = px.scatter_3d(
                    examples_umap[1:], x=0, y=1, z=2,
                    color=cat_list[1:],
                    opacity=.7,
                    hover_data=[word_to_embed_list[1:]],
                )
                fig.update_scenes(
                    xaxis_visible=False,
                    yaxis_visible=False,
                    zaxis_visible=False,
                )
                fig.update_traces(marker_size=4)
                st.plotly_chart(fig)