Spaces:
Runtime error
Runtime error
import os | |
import openai | |
from openai.embeddings_utils import get_embedding, cosine_similarity | |
from sklearn.manifold import TSNE | |
import streamlit as st | |
from matplotlib import cm | |
import pandas as pd | |
import numpy as np | |
from ast import literal_eval | |
import nomic | |
from nomic import atlas | |
import matplotlib.pyplot as plt | |
import matplotlib | |
import numpy as np | |
from dotenv import load_dotenv | |
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# OpenAI embedding model name.
MODEL = "text-embedding-ada-002"

st.set_page_config(page_title="Visual Embeddings and Similarity", page_icon="🤖", layout="wide")

# Sidebar with the OpenAI API key and the Nomic token, pre-filled from the
# environment. The inputs are masked so the secrets are not shown on screen.
# NOTE(review): pre-filling from server-side environment variables presumably
# still hands the values to every visitor's browser — confirm this is intended.
st.sidebar.title("Credentials")
st.sidebar.write("OpenAI API Key")
openai_api_key = st.sidebar.text_input(
    "Enter your OpenAI API Key",
    value=os.getenv("OPENAI_API_KEY"),
    type="password",
)
st.sidebar.write("Nomic Token")
nomic_token = st.sidebar.text_input(
    "Enter your Nomic Token",
    value=os.getenv("NOMIC_TOKEN"),
    type="password",
)

# Use the values from the sidebar (which default to the environment) instead
# of re-reading the environment, so that user-supplied credentials take effect.
openai.api_key = openai_api_key or None

# Only log in to Nomic once a token is available; calling login without one
# at import time would fail before the page has a chance to render.
if nomic_token:
    nomic.login(nomic_token)
# Load the review dataset.
datafile_path = "food_review.csv"
# Keep six columns, selected by position.
# NOTE(review): the original comment lists ProductId, Score, Summary, Text,
# n_tokens and embedding, but position 0 looks like the unnamed index column
# ('Unnamed: 0' is renamed to 'id' further down) — confirm against the CSV.
df = pd.read_csv(datafile_path, usecols=[0, 1, 3, 5, 7, 8])

# Page header followed by the raw dataset.
st.title("Visual Embeddings and Similarity")
st.write("Amazon food reviews dataset")
st.write(df)

# Search form: a sentence input plus a submit button. `btn` is True only on
# the script run triggered by pressing "Run".
st.write("Search similarity")
form = st.form('Embeddings')
with form:
    question = st.text_input(
        "Enter a sentence to search for semantic similarity",
        value="I love this soup",
    )
    btn = st.form_submit_button("Run")
if btn:
    # Both credentials are required. An untouched or cleared text input can
    # yield an empty string rather than None, so test truthiness.
    if openai_api_key and nomic_token:
        with st.spinner("Loading"):
            # Embed the search sentence.
            search_term_vector = np.array(get_embedding(question, engine=MODEL))

            # The CSV stores each embedding as a Python-literal string; parse
            # the column once and reuse the resulting matrix everywhere below.
            matrix = np.array(df["embedding"].apply(literal_eval).to_list())

            # Euclidean distance from every row to the search-term vector.
            df["distance_to_search_term"] = np.linalg.norm(matrix - search_term_vector, axis=1)

            # Normalize the distances to the range 0-1 for coloring. When all
            # distances are equal the spread is zero; fall back to 0.0 instead
            # of dividing by zero.
            lowest = df["distance_to_search_term"].min()
            spread = df["distance_to_search_term"].max() - lowest
            if spread > 0:
                df["normalized_distance"] = (df["distance_to_search_term"] - lowest) / spread
            else:
                df["normalized_distance"] = 0.0

            # 2D visualization: project the embeddings with t-SNE.
            tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
            vis_dims = tsne.fit_transform(matrix)
            colors = cm.rainbow(df["normalized_distance"])

            # Draw on an explicit figure rather than the global pyplot state,
            # so points from earlier runs are never carried into this plot.
            fig, ax = plt.subplots()
            ax.scatter(vis_dims[:, 0], vis_dims[:, 1], color=colors, alpha=0.3)
            ax.set_title("Similarity to search term visualized in language using t-SNE")

            # Cosine similarity between every stored embedding and the search term.
            df["similarities"] = [cosine_similarity(row, search_term_vector) for row in matrix]

            st.title("Visual embedding of the search term and the 20 most similar sentences")
            col1, col2 = st.columns(2)
            # Left column: the t-SNE scatter plot.
            col1.pyplot(fig)
            plt.close(fig)  # release the figure once it has been rendered
            # Right column: the 20 most similar rows (similarity and text only).
            col2.write(df[['similarities', 'Text']].sort_values("similarities", ascending=False).head(20))

            st.title("Nomic mappping embeddings")
            embeddings = matrix
            # Atlas receives the vectors separately, so drop them from the
            # per-row metadata and expose the index column as 'id'.
            df = df.drop('embedding', axis=1)
            df = df.rename(columns={'Unnamed: 0': 'id'})
            data = df.to_dict('records')
            project = atlas.map_embeddings(embeddings=embeddings, data=data,
                                           id_field='id',
                                           colorable_fields=['Score'])

            # Show the project's string form, then take whatever follows the
            # first colon as the map link.
            # NOTE(review): this relies on str(project) looking like
            # "<name>: <url>" — confirm against the installed nomic version.
            project_str = str(project)
            st.text(project_str)
            _, separator, project_link = project_str.partition(':')
            project_link = project_link.strip()
            if separator and project_link:
                # Embed the map in an iframe.
                st.markdown(f'<iframe src="{project_link}" width="100%" height="600px"></iframe>', unsafe_allow_html=True)
            else:
                # No colon in the string: nothing usable to embed.
                st.warning("Could not extract the Nomic map link from the project description")
    else:
        st.write("Please enter your OpenAI API Key and Nomic Token in the sidebar")