Spaces:
Sleeping
Sleeping
import argparse | |
import logging | |
import sentence_transformers | |
import datasets | |
import gradio as gr | |
# Suppress all logging calls of severity CRITICAL and below, process-wide.
logging.disable(logging.CRITICAL)

# Sentence encoder used to embed incoming queries; pinned to the CPU.
model = sentence_transformers.SentenceTransformer(
    "dangvantuan/sentence-camembert-large",
    device="cpu",
)

# Load the JSON records as the "train" split, then attach the FAISS index
# stored in "index.faiss" under the index name "embeddings".
dataset = datasets.load_dataset(
    "json",
    data_files=["./data/dataset.json"],
    split="train",
)
dataset.load_faiss_index(index_name="embeddings", file="index.faiss")
def search(query, k):
    """Return the ``k`` dataset entries nearest to ``query``.

    The query is embedded with the module-level ``model`` and looked up in
    the "embeddings" index of the module-level ``dataset``.

    Args:
        query: Value passed to ``model.encode`` to obtain the query embedding.
        k: Number of neighbours to retrieve. Coerced with ``int`` so that a
            float (e.g. from a numeric UI widget) is accepted.

    Returns:
        A list with one dict per retrieved example, each holding:
        ``"title"`` (the example's title), ``"transcript"`` (the example's
        text prefixed with ``[start ====> end]``) and ``"link"`` (its url).
    """
    query_embedding = model.encode(query)

    # The first element of the returned pair is not needed here.
    _, retrieved_examples = dataset.get_nearest_examples(
        "embeddings",
        query_embedding,
        k=int(k),
    )

    # The retrieved examples are column-oriented; zip walks them row by row.
    rows = zip(
        retrieved_examples["text"],
        retrieved_examples["start"],
        retrieved_examples["end"],
        retrieved_examples["title"],
        retrieved_examples["url"],
    )
    return [
        {
            "title": title,
            "transcript": f"[{start} ====> {end}] {text}",
            "link": url,
        }
        for text, start, end, title, url in rows
    ]
# Gradio UI: a free-text query plus the number of neighbours to retrieve.
iface = gr.Interface(
    search,
    inputs=[
        gr.inputs.Textbox(label="Query"),
        # NOTE(review): the legacy gr.inputs.Number constructor appears to
        # accept only default/label/optional, so the 1-10 bound is expressed
        # with a slider instead of min_value/max_value keyword arguments.
        gr.inputs.Slider(minimum=1, maximum=10, step=1, default=3, label="K"),
    ],
    # search returns a single list containing K dicts; one JSON component
    # displays it for any K, whereas three fixed textboxes only matched K == 3.
    outputs=gr.outputs.JSON(label="Results"),
    title="Camembert and Faiss-powered Search Engine",
    description="Search through a dataset using Camembert and Faiss",
    theme="light",
    layout="vertical",
)

# Start the web server for the interface.
iface.launch()