umarigan committed on
Commit
78ef349
·
verified ·
1 Parent(s): cb83ba5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -6
app.py CHANGED
@@ -41,14 +41,28 @@ def calculate_distances(embeddings, query_embedding, metric):
41
  distances = -dot_similarity(embeddings, [query_embedding]) # Negated for consistency with other metrics
42
  return distances.flatten()
43
 
 
 
 
 
 
 
 
 
 
 
44
  def generate_plotly_figure(query, pdf_file, metric):
45
  logging.info("Generating plot with Plotly.")
46
  query_embedding = model.encode([query])[0]
47
  text = process_pdf(pdf_file.name)
48
  embeddings, sentences = create_embeddings(text)
 
 
 
 
 
49
  all_embeddings = np.vstack([embeddings, query_embedding])
50
- all_sentences = sentences + [query]
51
-
52
  umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
53
  umap_embeddings = umap_transform.fit_transform(all_embeddings)
54
 
@@ -59,10 +73,10 @@ def generate_plotly_figure(query, pdf_file, metric):
59
  colors.append('red') # For the query
60
 
61
  fig = go.Figure(data=go.Scatter(x=umap_embeddings[:-1, 0], y=umap_embeddings[:-1, 1], mode='markers',
62
- marker=dict(color=colors[:-1]), text=all_sentences[:-1],
63
- name='Sentences'))
64
  fig.add_trace(go.Scatter(x=[umap_embeddings[-1, 0]], y=[umap_embeddings[-1, 1]], mode='markers',
65
- marker=dict(color='red'), text=[query], name='Query'))
66
  fig.update_layout(title="UMAP Projection of Sentences with Query Highlight", xaxis_title="UMAP 1", yaxis_title="UMAP 2")
67
 
68
  logging.info("Plotly figure created successfully.")
@@ -82,7 +96,7 @@ iface = gr.Interface(
82
  gr.Radio(choices=["cosine", "euclidean", "manhattan", "dot"], label="Distance Metric")
83
  ],
84
  outputs=gr.Plot(),
85
- title="PDF Content Visualizer",
86
  description="""This tool allows you to upload a PDF document, input a query, and visualize the context of the document
87
  as it relates to your query. It uses UMAP for dimensionality reduction and highlights the query and its closest contexts
88
  within the document based on the selected distance metric. Choose from cosine, Euclidean, Manhattan, or dot product metrics
 
41
  distances = -dot_similarity(embeddings, [query_embedding]) # Negated for consistency with other metrics
42
  return distances.flatten()
43
 
44
+ def wrap_text(text, width=40):
45
+ """
46
+ Inserts HTML line breaks for Plotly hover text.
47
+ :param text: The text to wrap.
48
+ :param width: The maximum line width before wrapping.
49
+ :return: Text with line breaks inserted.
50
+ """
51
+ wrapped_text = '<br>'.join([text[i:i+width] for i in range(0, len(text), width)])
52
+ return wrapped_text
53
+
54
  def generate_plotly_figure(query, pdf_file, metric):
55
  logging.info("Generating plot with Plotly.")
56
  query_embedding = model.encode([query])[0]
57
  text = process_pdf(pdf_file.name)
58
  embeddings, sentences = create_embeddings(text)
59
+
60
+ # Wrap text for each sentence
61
+ sentences_wrapped = [wrap_text(sentence) for sentence in sentences]
62
+ all_sentences_wrapped = sentences_wrapped + [wrap_text(query)] # Apply wrapping to the query as well
63
+
64
  all_embeddings = np.vstack([embeddings, query_embedding])
65
+
 
66
  umap_transform = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, random_state=42)
67
  umap_embeddings = umap_transform.fit_transform(all_embeddings)
68
 
 
73
  colors.append('red') # For the query
74
 
75
  fig = go.Figure(data=go.Scatter(x=umap_embeddings[:-1, 0], y=umap_embeddings[:-1, 1], mode='markers',
76
+ marker=dict(color=colors[:-1]), text=all_sentences_wrapped[:-1],
77
+ name='Sentences', hoverinfo='text'))
78
  fig.add_trace(go.Scatter(x=[umap_embeddings[-1, 0]], y=[umap_embeddings[-1, 1]], mode='markers',
79
+ marker=dict(color='red'), text=[all_sentences_wrapped[-1]], name='Query', hoverinfo='text'))
80
  fig.update_layout(title="UMAP Projection of Sentences with Query Highlight", xaxis_title="UMAP 1", yaxis_title="UMAP 2")
81
 
82
  logging.info("Plotly figure created successfully.")
 
96
  gr.Radio(choices=["cosine", "euclidean", "manhattan", "dot"], label="Distance Metric")
97
  ],
98
  outputs=gr.Plot(),
99
+ title="Semantic Search Visualizer",
100
  description="""This tool allows you to upload a PDF document, input a query, and visualize the context of the document
101
  as it relates to your query. It uses UMAP for dimensionality reduction and highlights the query and its closest contexts
102
  within the document based on the selected distance metric. Choose from cosine, Euclidean, Manhattan, or dot product metrics