"""Gradio demo: semantic search over a FAISS-indexed dataset of transcripts."""

import logging

import gradio as gr
import datasets
import sentence_transformers

# Silence all log output at CRITICAL level and below for the whole process.
logging.disable(logging.CRITICAL)

# Sentence-embedding model used to encode incoming queries.
# NOTE(review): device is hard-coded to "cuda"; this requires a GPU at startup.
model = sentence_transformers.SentenceTransformer(
    "dangvantuan/sentence-camembert-large",
    device="cuda",
)

# Records are read from a local JSON file; the pre-built FAISS index stored in
# "index.faiss" is attached under the index name "embeddings".
dataset = datasets.load_dataset(
    "json",
    data_files=["./dataset.json"],
    split="train",
)
dataset.load_faiss_index("embeddings", "index.faiss")


def search(query: str, k: int) -> list:
    """Return the ``k`` dataset entries nearest to ``query``.

    Args:
        query: Free-text search query; it is embedded with ``model``.
        k: Number of results to retrieve. The Gradio "number" input may
            deliver a float (or ``None`` when left empty), so the value is
            coerced to ``int`` before it reaches the index.

    Returns:
        A list of dicts with the keys ``"title"``, ``"transcript"`` (formatted
        as ``"[start ====> end] text"``) and ``"link"``. The list is empty
        when ``k`` is missing or smaller than 1.
    """
    # The index search needs a positive integer count.
    k = int(k) if k is not None else 0
    if k < 1:
        return []

    query_embedding = model.encode(query)
    _, retrieved_examples = dataset.get_nearest_examples(
        "embeddings",
        query_embedding,
        k=k,
    )

    # retrieved_examples is column-oriented; zip the columns back into rows.
    rows = zip(
        retrieved_examples["text"],
        retrieved_examples["start"],
        retrieved_examples["end"],
        retrieved_examples["title"],
        retrieved_examples["url"],
    )
    return [
        {
            "title": title,
            "transcript": f"[{start} ====> {end}] {text}",
            "link": url,
        }
        for text, start, end, title, url in rows
    ]


iface = gr.Interface(
    fn=search,
    inputs=["text", "number"],
    # String shortcut, consistent with `inputs`; avoids the legacy
    # `gr.outputs` namespace.
    outputs="json",
    title="Search Dataset",
    description="Search a dataset using Camembert and Faiss.",
    # One example row; each row holds one value per input component.
    examples=[
        ["Enter a query to search for.", 5],
    ],
)

if __name__ == "__main__":
    iface.launch()