File size: 5,004 Bytes
e6c2600
 
 
 
 
 
 
f3cc1a4
e6c2600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4073415
 
 
1b97a54
4073415
e6c2600
 
 
 
 
 
 
 
 
 
 
 
 
 
9c4539c
59e95b6
e6c2600
 
 
 
 
 
d2f8be4
e6c2600
 
 
d2f8be4
e6c2600
 
 
6de608f
e6c2600
 
 
 
 
 
d2f8be4
e6c2600
 
 
 
 
 
 
d2f8be4
e6c2600
4b4fff0
e6c2600
 
 
 
 
d2f8be4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import gradio as gr
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import euclidean_distances

# Read the table of patent abstracts together with their stored embeddings
text_embeddings = pd.read_parquet('text_embeddings_abstract_generated_by_LLM.parquet')

# Sentence-transformer checkpoints offered in the drop-down list,
# listed in the order in which they are loaded
_MODEL_NAMES = (
    'all-MiniLM-L6-v2',
    'intfloat/e5-large-v2',
    'intfloat/e5-small-v2',
    'thenlper/gte-large',
    'avsolatorio/GIST-large-Embedding-v0',
)

# Registry: checkpoint name -> loaded SentenceTransformer instance
model_options = {name: SentenceTransformer(name) for name in _MODEL_NAMES}

# Individual handles to the very same model instances
model_all_Mini = model_options['all-MiniLM-L6-v2']
model_e5_large_v2 = model_options['intfloat/e5-large-v2']
model_e5_small_v2 = model_options['intfloat/e5-small-v2']
model_gte_large = model_options['thenlper/gte-large']
model_GIST_large = model_options['avsolatorio/GIST-large-Embedding-v0']

# Main function for the Gradio interface
def find_similar_texts(model_name, input_text):
    """Return the five stored patent abstracts closest to ``input_text``.

    The query is first embedded with the all-MiniLM-L6-v2 model and compared
    against the stored 'embedding_all-MiniLM-L6-v2' column. Only when at
    least one abstract lies within a Euclidean distance of 1.05 is the
    actual ranking carried out with the model chosen by the user.

    Args:
        model_name: Key into ``model_options``; the matching embeddings are
            read from the DataFrame column ``'embedding_' + model_name``.
        input_text: Free-text description entered by the user.

    Returns:
        A string for the output text box: either a hint (no model selected,
        no text entered, nothing similar found) or the five nearest entries
        formatted with patent number, title and abstract.
    """
    # Check whether model has been selected
    if not model_name:
        return "You have forgotten to select a sentence-transformer."

    # Nothing to embed: this also happens when the model drop-down changes
    # before any text was typed, so answer without running an encoder.
    if not input_text or not input_text.strip():
        return "Please enter a description to search for."

    # Check whether there are abstracts matching the text input
    input_embedding_mini = model_all_Mini.encode(input_text).reshape(1, -1)
    embedding_matrix_mini = np.vstack(text_embeddings['embedding_all-MiniLM-L6-v2'])
    distances_mini = euclidean_distances(embedding_matrix_mini, input_embedding_mini).flatten()

    # Only continue if similar abstract found
    if not (distances_mini < 1.05).any():
        return "It seems there is no patent abstract close to your description."

    selected_model = model_options[model_name]
    embedding_column = 'embedding_' + model_name
    input_embedding = selected_model.encode(input_text).reshape(1, -1)
    embedding_matrix = np.vstack(text_embeddings[embedding_column])
    distances = euclidean_distances(embedding_matrix, input_embedding).flatten()

    # Rank by row position instead of writing a distance column into the
    # module-level DataFrame: that frame is shared by every request, so
    # mutating it would let concurrent searches overwrite each other.
    nearest_rows = np.argsort(distances, kind='stable')[:5]
    top_five = text_embeddings.iloc[nearest_rows]

    return '\n\n'.join(
        f"Patent No: {patent_no}\n: {title}\n {abstract}\n"
        for patent_no, title, abstract in zip(
            top_five['patent no'], top_five['title'], top_five['abstract']
        )
    )

# Assemble the Gradio user interface with the Blocks API
with gr.Blocks() as demo:
    gr.Markdown("## Sentence-Transformer based AI-Generated-Patent-Abstract Search")

    # First column holds the query controls, second column the search result
    with gr.Row():
        with gr.Column():
            model_selector = gr.Dropdown(
                choices=list(model_options),
                label="Chose Sentence-Transformer",
            )
            text_input = gr.Textbox(
                lines=2,
                placeholder="machine learning for drug dosing",
                label="input_text (example: machine learning for drug dosing. Remark: This is only a small number of AI generated machine learning patents!)",
            )
            submit_button = gr.Button("search")

        with gr.Column():
            output = gr.Textbox(label="top 5 patent abstracts (if available)")

    # Explanatory text shown below the search controls
    gr.Markdown("""
    ### Description
    This demo app leverages several Sentence Transformer models to compute the semantic distance between user input and a small number of AI generated patent abstracts in the field of machine learning and AI.

- 'all-MiniLM-L6-v2': embedding size is 384. [More info](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) and [here](https://sbert.net/).
- 'intfloat/e5-large-v2'. Text Embeddings by Weakly-Supervised Contrastive Pre-training, embedding size is 1024. [More info](https://huggingface.co/intfloat/e5-large-v2).
- 'intfloat/e5-small-v2': Text Embeddings by Weakly-Supervised Contrastive Pre-training, embedding size is 384. [More info](https://huggingface.co/intfloat/e5-small-v2).
- 'thenlper/gte-large': General Text Embeddings (GTE) model, embedding size is 1024. [More info](https://huggingface.co/thenlper/gte-large) and [here](https://arxiv.org/abs/2308.03281).
- 'avsolatorio/GIST-large-Embedding-v0': Fine-tuned on top of the BAAI/bge-large-en-v1.5 using the MEDI dataset augmented with mined triplets from the MTEB Classification training dataset, embedding size is 1024. [More info](https://huggingface.co/avsolatorio/GIST-large-Embedding-v0) and [here](https://arxiv.org/abs/2402.16829).



<strong>Please note: The data used in this demo are AI generated and is intended only for demonstration purposes. The demo does not provide any real patent information.
    """)

    # The button click, a change of the selected model and submitting the
    # text box all run the same search with the same inputs and output.
    for register_trigger in (submit_button.click, model_selector.change, text_input.submit):
        register_trigger(find_similar_texts, inputs=[model_selector, text_input], outputs=output)

demo.launch()

# The patents can be viewed at [Espacenet](https://worldwide.espacenet.com/?locale=en_EP), the free online service by the European Patent Office.