import json
import torch
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel

import gradio as gr

# instantiate tokenizer and model
def get_model(base_name='intfloat/e5-large-v2'):
    tokenizer = AutoTokenizer.from_pretrained(base_name)
    model = AutoModel.from_pretrained(base_name)
    
    return tokenizer, model

# get similarity scores for input_texts. The first `how_many_q` entries are
# treated as queries and the rest as passages; the returned matrix has one
# row per query and one column per passage
def get_scores(model, tokenizer, input_texts, max_length=512, how_many_q=1, normalize=True):
    # Tokenize the input texts
    batch_dict = tokenizer(
        input_texts,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors='pt'
    )

    # no gradients are needed for inference
    with torch.no_grad():
        outputs = model(**batch_dict)
    embeddings = average_pool(
        outputs.last_hidden_state, batch_dict['attention_mask']
    )

    # (Optionally) normalize embeddings so the dot product is cosine similarity
    if normalize:
        embeddings = F.normalize(embeddings, p=2, dim=1)

    # query rows x passage columns, scaled by 100
    scores = (embeddings[:how_many_q] @ embeddings[how_many_q:].T) * 100
    return scores
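
# A minimal usage sketch (illustrative, not executed by the app): with one
# query and two passages in E5's expected "query: ..." / "passage: ..."
# format, the returned tensor has shape (1, 2), one row per query and one
# column per passage.
#
#   example_texts = [
#       "query: how tall is mount everest",
#       "passage: Mount Everest is Earth's highest mountain above sea level.",
#       "passage: The Nile is a major river in northeastern Africa.",
#   ]
#   example_scores = get_scores(model, tokenizer, example_texts)  # shape (1, 2)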

# get the top-k results out of the scores, sorted per query. This
# function returns only the sorted indices and score values
def get_top(scores, top_k=None):
    result = torch.sort(scores, descending=True, dim=1)
    top_indices = result.indices
    top_values = result.values

    if top_k is not None:
        top_indices = top_indices[:, :top_k]
        top_values = top_values[:, :top_k]

    return top_indices, top_values
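
# A quick illustration (not executed by the app): sorting one row of scores
# and keeping the top two entries.
#
#   get_top(torch.tensor([[10., 30., 20.]]), top_k=2)
#   # -> indices tensor([[1, 2]]), values tensor([[30., 20.]])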

# get the top-k results out of the scores. This function returns the
# scores and indices along with the associated passage text, keyed by
# the query each row of scores belongs to
def get_human_readable_top(scores, input_texts, top_k=None):
    queries = [text for text in input_texts if text.startswith("query:")]
    passages = [text for text in input_texts if not text.startswith("query:")]
    top_indices, top_values = get_top(scores, top_k)

    result = {}
    for query_idx, (indices, values) in enumerate(zip(top_indices, top_values)):
        q = queries[query_idx]
        a = []

        for idx, val in zip(indices.tolist(), values.tolist()):
            a.append({
                "idx": idx,
                "val": round(val, 3),
                "text": passages[idx]
            })

        result[q] = a

    return result
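
# Shape of the returned mapping (values below are illustrative, not real
# model output):
#
#   {
#       "query: how tall is mount everest": [
#           {"idx": 0, "val": 87.123, "text": "passage: Mount Everest ..."},
#           {"idx": 1, "val": 62.045, "text": "passage: The Nile ..."}
#       ]
#   }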

# mean-pool the token embeddings, ignoring padded positions: padded tokens
# (attention_mask == 0) are zeroed out before summing, and the sum is divided
# by the number of real tokens in each sequence
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
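
# A small worked example (sketch): for one sequence with hidden states
# [[1., 1.], [3., 3.], [5., 5.]] and attention_mask [1, 1, 0], the padded
# last position is zeroed out and the pooled embedding is
# ([1., 1.] + [3., 3.]) / 2 == [2., 2.].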

def get_result(q_txt, p_txt1, p_txt2, p_txt3, p_txt4, p_txt5):
    input_texts = [
        f"query: {q_txt}"
    ]

    # only non-empty passage boxes are sent to the model
    for p_txt in (p_txt1, p_txt2, p_txt3, p_txt4, p_txt5):
        if p_txt != '':
            input_texts.append(f"passage: {p_txt}")

    scores = get_scores(model, tokenizer, input_texts)
    result = get_human_readable_top(scores, input_texts)
    return json.dumps(result, indent=4)

tokenizer, model = get_model('intfloat/e5-large-v2')

with gr.Blocks() as demo:
    gr.Markdown("# E5 Large V2 Demo")
    
    q_txt = gr.Textbox(placeholder="Enter your query", label="Query")

    p_txt1 = gr.Textbox(placeholder="Enter passage 1", label="Passage 1")
    p_txt2 = gr.Textbox(placeholder="Enter passage 2", label="Passage 2")
    p_txt3 = gr.Textbox(placeholder="Enter passage 3", label="Passage 3")
    p_txt4 = gr.Textbox(placeholder="Enter passage 4", label="Passage 4")
    p_txt5 = gr.Textbox(placeholder="Enter passage 5", label="Passage 5")
    submit = gr.Button("Submit")
    o_txt = gr.Textbox(placeholder="Output", lines=10, interactive=False, label="Output")

    gr.Examples(
        [
            [
                "I'm searching for a planet not too far from Earth.",
                "Neptune is the eighth and farthest-known Solar planet from the Sun. In the Solar System, it is the fourth-largest planet by diameter, the third-most-massive planet, and the densest giant planet. It is 17 times the mass of Earth, slightly more massive than its near-twin Uranus.",
                "TRAPPIST-1d, also designated as 2MASS J23062928-0502285 d, is a small exoplanet (about 30% the mass of the earth), which orbits on the inner edge of the habitable zone of the ultracool dwarf star TRAPPIST-1 approximately 40 light-years (12.1 parsecs, or nearly 3.7336×1014 km) away from Earth in the constellation of Aquarius.",
                "A harsh desert world orbiting twin suns in the galaxy’s Outer Rim, Tatooine is a lawless place ruled by Hutt gangsters. Many settlers scratch out a living on moisture farms, while spaceport cities such as Mos Eisley and Mos Espa serve as home base for smugglers, criminals, and other rogues.",
                "",
                ""
            ]
        ], 
        inputs=[q_txt, p_txt1, p_txt2, p_txt3, p_txt4, p_txt5]
    )
    
    submit.click(
        get_result,
        [q_txt, p_txt1, p_txt2, p_txt3, p_txt4, p_txt5],
        o_txt
    )

demo.launch()