|
import heapq

import gradio as gr
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
|
|
|
def save_embeddings(sentences, filename):
    """Encode *sentences* with the module-level ``model`` and save the result.

    Args:
        sentences: Texts passed to ``model.encode``.
        filename: Destination handed to ``torch.save``.
    """
    # convert_to_tensor=True makes encode() return a tensor, which is what
    # torch.save serialises here.
    torch.save(model.encode(sentences, convert_to_tensor=True), filename)
|
|
|
def load_embeddings(filename):
    """Load an object saved with ``torch.save``, placing it on the GPU when one exists.

    Args:
        filename: Path of the serialised file.

    Returns:
        Whatever object was stored in *filename*, mapped to ``cuda`` if CUDA
        is available and to ``cpu`` otherwise.
    """
    if torch.cuda.is_available():
        target_device = torch.device('cuda')
    else:
        target_device = torch.device('cpu')
    # NOTE: torch.load relies on pickle; only point this at trusted files.
    return torch.load(filename, map_location=target_device)
|
|
|
def preprocess_space_descriptions(file_path):
    """Read space ids and descriptions from a headerless two-column TSV file.

    The file is decoded with each candidate encoding in turn until one
    succeeds. Rows without a description are dropped.

    Args:
        file_path: Path to a tab-separated file whose columns are read as
            ``space_id`` and ``description``.

    Returns:
        A ``(space_ids, descriptions)`` tuple of two equal-length lists.

    Raises:
        UnicodeDecodeError: If none of the candidate encodings can decode
            the file.
    """
    # latin-1 maps every byte to a character, so in practice it is the final
    # fallback and the entries after it are not reached.
    encodings = ('utf-8', 'latin-1', 'utf-16')

    last_error = None
    for encoding in encodings:
        try:
            # The encoding has to be passed explicitly; without it every
            # attempt would use pandas' default (UTF-8) and fail identically.
            df = pd.read_csv(
                file_path,
                sep='\t',
                header=None,
                names=['space_id', 'description'],
                encoding=encoding,
            )
        except UnicodeDecodeError as err:
            last_error = err
            continue
        break
    else:
        # UnicodeDecodeError needs (encoding, object, start, end, reason);
        # reuse the details of the last failure and chain it as the cause.
        raise UnicodeDecodeError(
            last_error.encoding,
            last_error.object,
            last_error.start,
            last_error.end,
            "Unable to decode the file using the available encodings.",
        ) from last_error

    df = df.dropna(subset=['description'])
    return df['space_id'].tolist(), df['description'].tolist()
|
|
|
def perform_similarity_search(query_embeddings, embeddings, space_ids, descriptions, top_k=10):
    """Rank indexed spaces against each query by cosine similarity.

    Args:
        query_embeddings: Query embeddings passed to ``util.cos_sim``
            (presumably one row per query -- confirm against callers).
        embeddings: Embeddings of the indexed descriptions; entry ``j`` is
            paired with ``space_ids[j]`` and ``descriptions[j]``.
        space_ids: Identifiers aligned with *embeddings*.
        descriptions: Description texts aligned with *embeddings*.
        top_k: Maximum number of results kept per query. Coerced to ``int``;
            negative values are treated as 0.

    Returns:
        A DataFrame with columns ``space_id``, ``description`` and ``score``.
        Results for each query are concatenated in query order, each group
        sorted by descending score.
    """
    # top_k may come from a UI widget as a float; a negative value would
    # otherwise mean "all but the last k" when used as a slice bound.
    limit = max(int(top_k), 0)

    # One row of scores per query, one column per indexed embedding.
    score_rows = util.cos_sim(query_embeddings, embeddings).tolist()

    results = []
    for row_scores in score_rows:
        candidates = zip(space_ids, descriptions, row_scores)
        # nlargest(k, it, key) yields the same items, in the same order, as
        # sorted(it, key=key, reverse=True)[:k] without sorting everything.
        results.extend(heapq.nlargest(limit, candidates, key=lambda item: item[2]))

    return pd.DataFrame(results, columns=["space_id", "description", "score"])
|
|
|
# Resources loaded once at import time and shared by the functions in this file.
MODEL_NAME = 'all-MiniLM-L6-v2'
DESCRIPTIONS_PATH = 'hf_spaces_descriptions.tsv'
EMBEDDINGS_PATH = 'embeddings_hf_spaces_descriptions.pt'

# Sentence-embedding model used both for saving embeddings and for queries.
model = SentenceTransformer(MODEL_NAME)

# Ids and texts of the indexed spaces (rows without a description are dropped).
space_ids, descriptions = preprocess_space_descriptions(DESCRIPTIONS_PATH)

# Precomputed embeddings; presumably produced from the same descriptions and
# in the same order -- TODO confirm the file matches the TSV above.
embeddings = load_embeddings(EMBEDDINGS_PATH)
|
|
|
# Gradio UI: a query box and a result-count slider feeding a results table.
with gr.Blocks() as demo:
    # Named `query_box` rather than `input` so the builtin input() is not
    # shadowed for the rest of the module.
    query_box = gr.Textbox(label="Enter your query")
    num_results_slider = gr.Slider(10, 100, value=10, step=1, label="Number of results")

    df_output = gr.Dataframe(label="Similarity Results", wrap=True)

    def search(query, num_results):
        """Embed *query* and return the most similar spaces as a DataFrame.

        Args:
            query: Free-text search string from the textbox.
            num_results: Number of rows requested via the slider.
        """
        query_embedding = model.encode([query], convert_to_tensor=True)
        # The slider value may arrive as a float depending on the Gradio
        # version; top_k must be an integer count.
        return perform_similarity_search(
            query_embedding, embeddings, space_ids, descriptions, top_k=int(num_results)
        )

    # Pressing Enter in the textbox runs the search; api_name exposes the
    # same callback as the "search" API endpoint.
    query_box.submit(
        search,
        inputs=[query_box, num_results_slider],
        outputs=df_output,
        api_name="search",
    )

demo.launch()
|
|