Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -41,14 +41,28 @@ def calculate_distances(embeddings, query_embedding, metric):
|
|
41 |
distances = -dot_similarity(embeddings, [query_embedding]) # Negated for consistency with other metrics
|
42 |
return distances.flatten()
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
def generate_plotly_figure(query, pdf_file, metric):
|
45 |
logging.info("Generating plot with Plotly.")
|
46 |
query_embedding = model.encode([query])[0]
|
47 |
text = process_pdf(pdf_file.name)
|
48 |
embeddings, sentences = create_embeddings(text)
|
|
|
|
|
|
|
|
|
|
|
49 |
all_embeddings = np.vstack([embeddings, query_embedding])
|
50 |
-
|
51 |
-
|
52 |
umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
|
53 |
umap_embeddings = umap_transform.fit_transform(all_embeddings)
|
54 |
|
@@ -59,10 +73,10 @@ def generate_plotly_figure(query, pdf_file, metric):
|
|
59 |
colors.append('red') # For the query
|
60 |
|
61 |
fig = go.Figure(data=go.Scatter(x=umap_embeddings[:-1, 0], y=umap_embeddings[:-1, 1], mode='markers',
|
62 |
-
marker=dict(color=colors[:-1]), text=
|
63 |
-
name='Sentences'))
|
64 |
fig.add_trace(go.Scatter(x=[umap_embeddings[-1, 0]], y=[umap_embeddings[-1, 1]], mode='markers',
|
65 |
-
marker=dict(color='red'), text=[
|
66 |
fig.update_layout(title="UMAP Projection of Sentences with Query Highlight", xaxis_title="UMAP 1", yaxis_title="UMAP 2")
|
67 |
|
68 |
logging.info("Plotly figure created successfully.")
|
@@ -82,7 +96,7 @@ iface = gr.Interface(
|
|
82 |
gr.Radio(choices=["cosine", "euclidean", "manhattan", "dot"], label="Distance Metric")
|
83 |
],
|
84 |
outputs=gr.Plot(),
|
85 |
-
title="
|
86 |
description="""This tool allows you to upload a PDF document, input a query, and visualize the context of the document
|
87 |
as it relates to your query. It uses UMAP for dimensionality reduction and highlights the query and its closest contexts
|
88 |
within the document based on the selected distance metric. Choose from cosine, Euclidean, Manhattan, or dot product metrics
|
|
|
41 |
distances = -dot_similarity(embeddings, [query_embedding]) # Negated for consistency with other metrics
|
42 |
return distances.flatten()
|
43 |
|
44 |
+
def wrap_text(text, width=40):
    """
    Inserts HTML line breaks for Plotly hover text.

    Lines are broken at whitespace so words stay intact; only a single
    token longer than ``width`` is split mid-word. Runs of whitespace
    (including newlines) are collapsed to single spaces.

    :param text: The text to wrap.
    :param width: The maximum line width before wrapping. Must be positive.
    :return: Text with ``<br>`` line breaks inserted; an empty string when
        ``text`` is empty or whitespace-only.
    :raises ValueError: If ``width`` is not a positive number.
    """
    # Local import keeps this block self-contained regardless of what the
    # module already imports at the top of the file.
    import textwrap

    # Fail loudly on a bad width instead of returning an empty string
    # (negative step) or surfacing an unrelated range() error (zero step).
    if width <= 0:
        raise ValueError(f"width must be positive, got {width!r}")

    # textwrap breaks on word boundaries and still hard-splits any token
    # that is longer than ``width``, so no output line exceeds the limit.
    lines = textwrap.wrap(text, width=width)
    return '<br>'.join(lines)
|
53 |
+
|
54 |
def generate_plotly_figure(query, pdf_file, metric):
|
55 |
logging.info("Generating plot with Plotly.")
|
56 |
query_embedding = model.encode([query])[0]
|
57 |
text = process_pdf(pdf_file.name)
|
58 |
embeddings, sentences = create_embeddings(text)
|
59 |
+
|
60 |
+
# Wrap text for each sentence
|
61 |
+
sentences_wrapped = [wrap_text(sentence) for sentence in sentences]
|
62 |
+
all_sentences_wrapped = sentences_wrapped + [wrap_text(query)] # Apply wrapping to the query as well
|
63 |
+
|
64 |
all_embeddings = np.vstack([embeddings, query_embedding])
|
65 |
+
|
|
|
66 |
umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
|
67 |
umap_embeddings = umap_transform.fit_transform(all_embeddings)
|
68 |
|
|
|
73 |
colors.append('red') # For the query
|
74 |
|
75 |
fig = go.Figure(data=go.Scatter(x=umap_embeddings[:-1, 0], y=umap_embeddings[:-1, 1], mode='markers',
|
76 |
+
marker=dict(color=colors[:-1]), text=all_sentences_wrapped[:-1],
|
77 |
+
name='Sentences', hoverinfo='text'))
|
78 |
fig.add_trace(go.Scatter(x=[umap_embeddings[-1, 0]], y=[umap_embeddings[-1, 1]], mode='markers',
|
79 |
+
marker=dict(color='red'), text=[all_sentences_wrapped[-1]], name='Query', hoverinfo='text'))
|
80 |
fig.update_layout(title="UMAP Projection of Sentences with Query Highlight", xaxis_title="UMAP 1", yaxis_title="UMAP 2")
|
81 |
|
82 |
logging.info("Plotly figure created successfully.")
|
|
|
96 |
gr.Radio(choices=["cosine", "euclidean", "manhattan", "dot"], label="Distance Metric")
|
97 |
],
|
98 |
outputs=gr.Plot(),
|
99 |
+
title="Semantic Search Visualizer",
|
100 |
description="""This tool allows you to upload a PDF document, input a query, and visualize the context of the document
|
101 |
as it relates to your query. It uses UMAP for dimensionality reduction and highlights the query and its closest contexts
|
102 |
within the document based on the selected distance metric. Choose from cosine, Euclidean, Manhattan, or dot product metrics
|