File size: 3,121 Bytes
bddb0ac
90dfdae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d8e3961
 
90dfdae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4dd63a4
 
 
 
 
90dfdae
 
 
 
 
 
 
d8e3961
 
 
4dd63a4
90dfdae
 
 
 
 
 
 
 
 
 
 
 
 
 
52b4dce
90dfdae
475c76c
90dfdae
 
475c76c
 
dedb273
 
 
 
 
 
 
52b4dce
dedb273
52b4dce
 
90dfdae
 
 
 
52b4dce
 
 
475c76c
 
52b4dce
90dfdae
bddb0ac
90dfdae
 
bddb0ac
90dfdae
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
from dataclasses import dataclass
from operator import add, sub

import gradio as gr

import numpy as np
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity

from pyparsing import Word, alphas, Char, ParseException

# Grammar for inputs such as "king - man + woman": one word followed by any
# number of (operator, word) pairs.
term = Word(alphas)

# Maps each operator symbol to the function that combines two vectors.
operations = {"+": add, "-": sub}

# Accept exactly the symbols `operations` can dispatch on, so the grammar and
# the dispatch table cannot drift apart.
operator = Char("".join(operations))

expression = term + (operator + term)[...]


def parse_expression(input):
    """Parse a word-arithmetic expression into a flat token sequence.

    Args:
        input: Expression text such as ``"king - man + woman"``.

    Returns:
        The pyparsing result for ``expression``: words at even positions and
        operator symbols at odd positions.

    Raises:
        gr.Error: If ``input`` does not match the grammar in its entirety.
    """
    try:
        # parseAll=True turns trailing text the grammar cannot consume
        # (e.g. "king - man + 42" or a dangling operator) into an error
        # instead of silently evaluating only the leading part.
        return expression.parseString(input, parseAll=True)
    except ParseException as e:
        raise gr.Error(f"Parsing error: {e.msg} at position [{e.loc}].") from e


def evaluate_expression(input):
    """Fold a parsed expression into a single embedding vector.

    ``input`` alternates words and operator symbols, starting with a word.
    The first word's vector seeds the running total; every following
    (operator, word) pair is then applied to it from left to right.
    """
    # Even positions hold words, odd positions hold operator symbols.
    terms = input[::2]
    symbols = input[1::2]

    accumulated = word_to_vectors(terms[0])
    for symbol, term_word in zip(symbols, terms[1:]):
        combine = operations[symbol]
        accumulated = combine(accumulated, word_to_vectors(term_word))
    return accumulated


# Load the embeddings once at import time; the functions below read from
# these module-level tables.
dataset = load_dataset("karmiq/glove", split="train")
df = dataset.to_pandas()

# Parallel arrays built from the same frame: all_words[i] is the word whose
# embedding is all_vectors[i].
all_vectors = np.array(df["embeddings"].tolist())
all_words = df["word"].to_numpy()


def word_to_vectors(word):
    """Look up the embedding stored for ``word``.

    Returns the embedding of the first row of ``df`` whose ``word`` column
    equals ``word``; raises ``gr.Error`` when no row matches.
    """
    matching_rows = df.loc[df["word"] == word]
    embeddings = matching_rows.embeddings.to_numpy()
    if len(embeddings) >= 1:
        return embeddings[0]
    raise gr.Error("Word not found in the dictionary.")


def expression_to_vectors(input):
    """Parse ``input`` and evaluate it to a single embedding vector."""
    tokens = parse_expression(input)
    return evaluate_expression(tokens)


def get_results(expression):
    """Return the ten vocabulary words most similar to an expression.

    Args:
        expression: Word arithmetic such as ``"king - man + woman"``; it is
            lower-cased before parsing.

    Returns:
        A dict mapping each of the (up to) ten closest words to its cosine
        similarity with the evaluated expression, best match first. Tokens
        that appear in the expression itself are left out.

    Raises:
        gr.Error: If ``expression`` is empty or blank, cannot be parsed, or
            uses a word that has no embedding.
    """
    if not expression or not expression.strip():
        raise gr.Error("Please provide an expression.")

    # Parse once and reuse the tokens for both evaluation and exclusion.
    # Splitting the raw text on whitespace would miss the input words when
    # the operators are written without spaces ("king-man+woman").
    tokens = parse_expression(expression.lower())
    excluded = set(tokens)

    vectors = evaluate_expression(tokens)
    similarity_scores = cosine_similarity([vectors], all_vectors)[0]
    top_indices = np.argsort(similarity_scores)[::-1]

    # Walk the ranking only until enough words are collected, rather than
    # filtering the whole vocabulary and slicing afterwards.
    limit = 10
    closest = []
    for i in top_indices:
        word = all_words[i]
        if word in excluded:
            continue
        closest.append((word, similarity_scores[i]))
        if len(closest) == limit:
            break
    return dict(closest)


# Sample expressions offered in the UI; the first one is also the textbox's
# initial value.
examples = [
    "king - man + woman",
    "mother - woman + man",
    "berlin - germany + france",
    "saxophone - jazz + classical",
]

# Evaluated at import time and used as the output label's initial value.
initial_output = get_results(examples[0])

# Style overrides handed to gr.Blocks below.
css = """
button.gallery-item { color: var(--body-text-color) !important; }
.output-class { color: var(--color-red-700) !important; }
.confidence-set .label .text { font-weight: var(--weight-medium); }
.confidence-set:hover .label { color: var(--color-red-700) !important; }
"""

with gr.Blocks(
    css=css,
    theme=gr.themes.Monochrome(radius_size=gr.themes.sizes.radius_sm),
) as app:
    with gr.Row():
        # First column: expression input, run button, description, examples.
        with gr.Column():
            input = gr.Textbox(value=examples[0], label="Expression")
            with gr.Row():
                btn = gr.Button("Run", variant="primary")
            with gr.Row():
                gr.Markdown(
                    "Demonstration of computing cosine similarity of embeddings "
                    "from the [GloVe](https://nlp.stanford.edu/projects/glove/) dataset."
                )
            with gr.Row():
                gr.Examples(examples, inputs=input)

        # Second column: the result label.
        with gr.Column():
            output = gr.Label(label="Closest words", value=initial_output)

    # Clicking the button passes the textbox content to get_results and sends
    # the returned dict to the label.
    btn.click(fn=get_results, inputs=input, outputs=output)

app.launch()