"""Gradio demo: semantic search over a pre-embedded documentation dataset.

A question is embedded with the same checkpoint referenced by ``MODEL_CKPT``
and matched against the ``embeddings`` column of the dataset through a
FAISS index; the best hits are rendered as plain text.
"""

import gradio as gr
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer

MODEL_CKPT = "BAAI/bge-large-en-v1.5"
DATASET_NAME = "fashxp/pimcore-docs-embeddings-gpe"
TOP_K = 10  # number of nearest neighbours shown per query

tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)
model = AutoModel.from_pretrained(MODEL_CKPT)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# Inference only: make sure dropout and similar layers are disabled.
model.eval()


def cls_pooling(model_output):
    """Return the hidden state at position 0 of every sequence in the batch."""
    return model_output.last_hidden_state[:, 0]


def get_embeddings(text_list):
    """Embed a list of strings and return one pooled vector per string.

    The returned tensor lives on ``device`` and carries no autograd history.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # No gradients are needed for search; skipping autograd avoids building
    # (and retaining) a graph on every query.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)


embeddings_doc_dataset = load_dataset(DATASET_NAME)
embeddings_doc_dataset = embeddings_doc_dataset["train"]
embeddings_doc_dataset.add_faiss_index(column="embeddings")


def find_in_docs(question):
    """Return a text report of the ``TOP_K`` documentation entries nearest to *question*.

    Each entry lists the heading, the FAISS score and the URL, followed by a
    separator line. A blank question yields an empty string instead of
    running a meaningless search.
    """
    if not question or not question.strip():
        return ""

    question_embedding = get_embeddings([question]).cpu().detach().numpy()
    scores, samples = embeddings_doc_dataset.get_nearest_examples(
        "embeddings", question_embedding, k=TOP_K
    )

    samples_df = pd.DataFrame.from_dict(samples)
    samples_df["scores"] = scores
    # NOTE(review): the index is created with default settings, which
    # presumably means the scores are distances (lower = closer). If so,
    # this descending sort lists the weakest of the hits first - confirm the
    # intended order before changing it.
    samples_df.sort_values("scores", ascending=False, inplace=True)

    separator = "=" * 50
    return "".join(
        f"HEADING: {row.heading}\n"
        f"SCORE: {row.scores}\n"
        f"URL: {row.url}\n"
        f"{separator}\n\n"
        for _, row in samples_df.iterrows()
    )


demo = gr.Interface(fn=find_in_docs, inputs="text", outputs="text")
demo.launch(share=True)