Spaces:
Sleeping
Sleeping
File size: 1,625 Bytes
b352653 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 |
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
import torch
# Checkpoint used for both the tokenizer and the encoder that embed questions.
model_ckpt = "BAAI/bge-large-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
def cls_pooling(model_output):
    """Reduce a model output to one vector per input sequence.

    Args:
        model_output: Object exposing ``last_hidden_state``, indexed as
            (batch, position, ...).

    Returns:
        ``last_hidden_state`` at position 0 for every batch item. With
        BERT-style tokenizers that position presumably holds the ``[CLS]``
        token, hence the name.
    """
    return model_output.last_hidden_state[:, 0]
def get_embeddings(text_list):
    """Embed a batch of texts with the module-level tokenizer and model.

    Args:
        text_list: List of strings to embed. The batch is padded to its
            longest member and truncated by the tokenizer.

    Returns:
        Whatever ``cls_pooling`` extracts from the model output: one vector
        per input text, living on ``device`` and carrying no autograd history.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # Move every tokenizer tensor to the device the model was placed on.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: without no_grad() each call would build and keep an
    # autograd graph that nothing ever backpropagates through.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)
# Fetch the documentation dataset and keep only its 'train' split.
embeddings_doc_dataset = load_dataset("fashxp/pimcore-docs-embeddings-gpe")['train']
# Attach a FAISS index over the "embeddings" column so nearest-neighbour
# lookups can be run against it later.
embeddings_doc_dataset.add_faiss_index(column="embeddings")
import pandas as pd
def find_in_docs(question):
    """Return the ten documentation entries nearest to ``question``.

    The question is embedded with ``get_embeddings`` and looked up in the
    FAISS index attached to ``embeddings_doc_dataset``.

    Args:
        question: Free-text query string.

    Returns:
        A plain-text report with one HEADING / SCORE / URL entry per hit,
        closest match first, each entry followed by a line of 50 ``=``
        characters and a blank line.
    """
    question_embedding = get_embeddings([question]).cpu().detach().numpy()
    scores, samples = embeddings_doc_dataset.get_nearest_examples(
        "embeddings", question_embedding, k=10
    )
    samples_df = pd.DataFrame.from_dict(samples)
    samples_df["scores"] = scores
    # The index is created without an explicit metric, so FAISS reports L2
    # distances: a smaller score is a closer match and must be listed first.
    # NOTE(review): flip this if the index is ever rebuilt with inner product.
    samples_df = samples_df.sort_values("scores", ascending=True)
    separator = "=" * 50
    # Build the report in one join instead of repeated string concatenation.
    return "".join(
        f"HEADING: {row.heading}\n"
        f"SCORE: {row.scores}\n"
        f"URL: {row.url}\n"
        f"{separator}\n\n"
        for _, row in samples_df.iterrows()
    )
import gradio as gr
# Expose find_in_docs through a minimal text-in / text-out Gradio interface.
demo = gr.Interface(fn=find_in_docs, inputs="text", outputs="text")
# share=True asks Gradio for a public share link in addition to the local server.
demo.launch(share=True)