radames committed on
Commit
b802856
·
1 Parent(s): 5fbd493

Upload 2 files

Browse files
Files changed (2) hide show
  1. lib-example.html +373 -0
  2. utils.js +5 -0
lib-example.html ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <html>
2
+ <head>
3
+ <meta content="text/html;charset=utf-8" http-equiv="Content-Type" />
4
+ <title>Candle Bert</title>
5
+ </head>
6
+ <body></body>
7
+ </html>
8
+
9
+ <!DOCTYPE html>
10
+ <html>
11
+ <head>
12
+ <meta charset="UTF-8" />
13
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
14
+ <style>
15
+ @import url("https://fonts.googleapis.com/css2?family=Source+Code+Pro:wght@200;300;400&family=Source+Sans+3:wght@100;200;300;400;500;600;700;800;900&display=swap");
16
+ html,
17
+ body {
18
+ font-family: "Source Sans 3", sans-serif;
19
+ }
20
+ </style>
21
+ <script src="https://cdn.tailwindcss.com"></script>
22
+ <script type="module" src="./code.js"></script>
23
+ <script type="module">
24
+ import { hcl } from "https://cdn.skypack.dev/d3-color@3";
25
+ import { interpolateReds } from "https://cdn.skypack.dev/d3-scale-chromatic@3";
26
+ import { scaleLinear } from "https://cdn.skypack.dev/d3-scale@4";
27
+ import {
28
+ getModelInfo,
29
+ getEmbeddings,
30
+ getWikiText,
31
+ cosineSimilarity,
32
+ } from "./utils.js";
33
+
34
// Module worker handed to getEmbeddings() for every embedding request.
const bertWorker = new Worker("./bertWorker.js", { type: "module" });

// Resolve every DOM node the script touches in a single pass. The selector
// list and the destructuring pattern must stay in the same order.
const [
  inputContainerEL,
  textAreaEl,
  outputAreaEl,
  formEl,
  searchInputEl,
  formWikiEl,
  searchWikiEl,
  outputStatusEl,
  modelSelectEl,
] = [
  "#input-container",
  "#input-area",
  "#output-area",
  "#form",
  "#search-input",
  "#form-wiki",
  "#search-wiki",
  "#output-status",
  "#model",
].map((selector) => document.querySelector(selector));

// Splits on a whitespace character that directly follows "." or "?". The three
// negative lookbehinds reject splits after patterns such as "x.y.", "Mr." and
// "A." (presumably to keep abbreviations and initials in one sentence).
const sentencesRegex =
  /(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?)\s/gm;

// Shared mutable state for the page:
let isCalculating = false; // true while an embedding request is in flight
let currInputText = ""; // text currently rendered in the output area
let sentenceEmbeddings = []; // one embedding per rendered sentence span
54
+
55
/**
 * Shows or hides the editable textarea.
 *
 * @param {boolean} state - Truthy reveals the textarea and moves focus into
 *   it; falsy hides it.
 */
function toggleTextArea(state) {
  if (!state) {
    textAreaEl.hidden = true;
    return;
  }
  textAreaEl.hidden = false;
  textAreaEl.focus();
}
63
// Focusing the container reveals the textarea for editing.
inputContainerEL.addEventListener("focus", () => toggleTextArea(true));

// Leaving the textarea hides it again.
textAreaEl.addEventListener("blur", () => toggleTextArea(false));

// On focusout, additionally re-render and re-embed the text, but only when it
// actually changed and no calculation is currently running.
textAreaEl.addEventListener("focusout", () => {
  toggleTextArea(false);
  const editedText = textAreaEl.value;
  const isUnchanged = currInputText === editedText;
  if (isUnchanged || isCalculating) {
    return;
  }
  populateOutputArea(editedText);
  calculateEmbeddings(editedText);
});
75
+
76
// When the model selection changes, persist it in the URL and, if there is
// text loaded and nothing is running, recompute the sentence embeddings.
modelSelectEl.addEventListener("change", () => {
  // Mirror the selection into the ?model= query parameter without reloading.
  const params = new URLSearchParams(window.location.search);
  params.set("model", modelSelectEl.value);
  const nextUrl = `${window.location.pathname}?${params}`;
  window.history.replaceState({}, "", nextUrl);

  const hasText = currInputText !== "";
  if (!hasText || isCalculating) {
    return;
  }
  const text = textAreaEl.value;
  populateOutputArea(text);
  calculateEmbeddings(text);
});
88
+
89
/**
 * Renders `text` into the output area as one <span> per sentence and records
 * it as the current input text.
 *
 * Each span gets the id `sentence-<index>` so a later search can locate and
 * highlight it.
 *
 * @param {string} text - Text to split with `sentencesRegex` and display.
 */
function populateOutputArea(text) {
  currInputText = text;
  const parts = text.split(sentencesRegex);

  outputAreaEl.innerHTML = "";
  parts.forEach((sentence, index) => {
    const span = document.createElement("span");
    span.id = `sentence-${index}`;
    span.innerText = `${sentence} `;
    outputAreaEl.appendChild(span);
  });
}
101
// Search form: embeds the query, ranks the sentence embeddings by cosine
// similarity, highlights the ten best matches and scrolls to the top one.
formEl.addEventListener("submit", async (e) => {
  e.preventDefault();
  if (isCalculating || currInputText === "") return;
  // Without sentence embeddings there is nothing to rank; previously this case
  // threw on `distances[-1]` after the inputs had already been disabled.
  if (sentenceEmbeddings.length === 0) return;

  isCalculating = true;
  toggleInputs(true);
  try {
    const modelID = modelSelectEl.value;
    const { modelURL, tokenizerURL, configURL, search_prefix } =
      getModelInfo(modelID);
    const query = search_prefix + searchInputEl.value;

    outputStatusEl.classList.remove("invisible");
    outputStatusEl.innerText = "Calculating embeddings for query...";
    const out = await getEmbeddings(
      bertWorker,
      modelURL,
      tokenizerURL,
      configURL,
      modelID,
      [query]
    );
    outputStatusEl.classList.add("invisible");
    const queryEmbeddings = out.output[0];

    // Cosine similarity of the query against every sentence, best first,
    // keeping only the top 10.
    const distances = sentenceEmbeddings
      .map((embedding, id) => ({
        id,
        similarity: cosineSimilarity(queryEmbeddings, embedding),
      }))
      .sort((a, b) => b.similarity - a.similarity)
      .slice(0, 10);

    // Map [lowest kept similarity, highest] onto the Reds color ramp.
    const colorScale = scaleLinear()
      .domain([
        distances[distances.length - 1].similarity,
        distances[0].similarity,
      ])
      .range([0, 1])
      .interpolate(() => interpolateReds);

    // Clear highlighting left over from a previous search.
    outputAreaEl.querySelectorAll("span").forEach((el) => {
      el.style.color = "unset";
      el.style.backgroundColor = "unset";
    });

    for (const d of distances) {
      const el = outputAreaEl.querySelector(`#sentence-${d.id}`);
      // The rendered spans can be out of sync with the embeddings (e.g. after
      // a failed recalculation); skip ids that have no span.
      if (!el) continue;
      const color = colorScale(d.similarity);
      // Pick a readable font color for the background's lightness.
      el.style.color = hcl(color).l < 70 ? "white" : "black";
      el.style.backgroundColor = color;
    }

    const bestEl = outputAreaEl.querySelector(`#sentence-${distances[0].id}`);
    if (bestEl) {
      bestEl.scrollIntoView({
        behavior: "smooth",
        block: "center",
        inline: "nearest",
      });
    }
  } finally {
    // Always release the UI, even when embedding or rendering throws;
    // otherwise the inputs stay disabled until the page is reloaded.
    outputStatusEl.classList.add("invisible");
    isCalculating = false;
    toggleInputs(false);
  }
});
164
/**
 * Computes one embedding per sentence of `text` and stores the result in
 * `sentenceEmbeddings`.
 *
 * Sentences are embedded one at a time so the status line can report
 * progress. While running, `isCalculating` is true and the interactive inputs
 * are disabled; both are restored when the function settles, whether it
 * succeeds or fails.
 *
 * @param {string} text - Text to split with `sentencesRegex` and embed.
 * @returns {Promise<void>} Rejects with whatever `getModelInfo` or
 *   `getEmbeddings` throws; `sentenceEmbeddings` is left untouched then.
 */
async function calculateEmbeddings(text) {
  isCalculating = true;
  toggleInputs(true);
  try {
    const modelID = modelSelectEl.value;
    const { modelURL, tokenizerURL, configURL, document_prefix } =
      getModelInfo(modelID);

    const sentences = text.split(sentencesRegex);
    const allEmbeddings = [];
    outputStatusEl.classList.remove("invisible");
    for (const [id, sentence] of sentences.entries()) {
      const query = document_prefix + sentence;
      outputStatusEl.innerText = `Calculating embeddings: sentence ${
        id + 1
      } of ${sentences.length}`;
      const embeddings = await getEmbeddings(
        bertWorker,
        modelURL,
        tokenizerURL,
        configURL,
        modelID,
        [query],
        updateStatus
      );
      allEmbeddings.push(embeddings);
    }
    // Only publish once every sentence succeeded, so a partial run never
    // replaces a complete set.
    sentenceEmbeddings = allEmbeddings.map((e) => e.output[0]);
  } finally {
    // Without this, a single failed request left `isCalculating` stuck at
    // true and every input disabled until the page was reloaded.
    outputStatusEl.classList.add("invisible");
    isCalculating = false;
    toggleInputs(false);
  }
}
195
+
196
/**
 * Progress callback passed to getEmbeddings(): shows the message of
 * "loading" status updates in the status line and ignores everything else.
 *
 * @param {object} data - Update object; only `status` and `message` are read.
 */
function updateStatus(data) {
  const isLoading = "status" in data && data.status === "loading";
  if (!isLoading) {
    return;
  }
  outputStatusEl.innerText = data.message;
  outputStatusEl.classList.remove("invisible");
}
204
/**
 * Enables or disables every element carrying the `interactive` class.
 *
 * @param {boolean} state - Truthy disables the elements; falsy re-enables
 *   them.
 */
function toggleInputs(state) {
  const disabled = state ? true : false;
  for (const el of document.querySelectorAll(".interactive")) {
    el.disabled = disabled;
  }
}
214
+
215
// Typing in the Wikipedia field clears any custom validity message set by an
// earlier failed lookup.
searchWikiEl.addEventListener("input", () => searchWikiEl.setCustomValidity(""));
218
+
219
// Wikipedia form: loads an article by title (typed, or taken from one of the
// example buttons), shows it as the input text and embeds its sentences.
formWikiEl.addEventListener("submit", async (e) => {
  e.preventDefault();
  // `submitter` is null when the form is submitted without a submit button
  // (e.g. a scripted requestSubmit()), so guard before reading its dataset.
  const submitter = e.submitter;
  if (submitter && "example" in submitter.dataset) {
    searchWikiEl.value = submitter.innerText;
  }
  const text = searchWikiEl.value;

  if (isCalculating || text === "") return;
  try {
    const wikiText = await getWikiText(text);
    searchWikiEl.setCustomValidity("");
    // Assign `.value`, not `.innerHTML`: innerHTML parses the article as
    // markup (decoding character references) and no longer changes what the
    // textarea shows once the user has edited it, which left the textarea out
    // of sync with the rendered text.
    textAreaEl.value = wikiText;
    populateOutputArea(wikiText);
    // Intentionally not awaited: an embedding failure must not be reported as
    // an invalid article name by the catch below.
    calculateEmbeddings(wikiText);
    searchWikiEl.value = "";
  } catch {
    searchWikiEl.setCustomValidity("Invalid Wikipedia article name");
    searchWikiEl.reportValidity();
  }
});
239
// On load, restore the model selection from the ?model= query parameter.
document.addEventListener("DOMContentLoaded", () => {
  const modelID = new URLSearchParams(window.location.search).get("model");
  if (!modelID) return;

  // Ignore ids that match no <option>. Assigning an unknown value to a
  // <select> deselects every option, and the dispatched "change" event would
  // then write an empty model id back into the URL.
  const isKnownModel = Array.from(modelSelectEl.options).some(
    (option) => option.value === modelID
  );
  if (!isKnownModel) return;

  modelSelectEl.value = modelID;
  modelSelectEl.dispatchEvent(new Event("change"));
});
247
+ </script>
248
+ </head>
249
+ <body class="container max-w-4xl mx-auto p-4">
250
+ <main class="grid grid-cols-1 gap-5 relative">
251
+ <span class="absolute text-5xl -ml-[1em]"> 🕯️ </span>
252
+ <div>
253
+ <h1 class="text-5xl font-bold">Candle BERT</h1>
254
+ <h2 class="text-2xl font-bold">Rust/WASM Demo</h2>
255
+ <p class="max-w-lg">
256
+ Running sentence embeddings and similarity search in the browser using
257
+ the Bert Model written with
258
+ <a
259
+ href="https://github.com/huggingface/candle/"
260
+ target="_blank"
261
+ class="underline hover:text-blue-500 hover:no-underline"
262
+ >Candle
263
+ </a>
264
+ and compiled to Wasm. Embedding models are from
265
+ <a
266
+ href="https://huggingface.co/sentence-transformers/"
267
+ target="_blank"
268
+ class="underline hover:text-blue-500 hover:no-underline">
269
+ Sentence Transformers
270
+ </a>
271
+ and
272
+ <a
273
+ href="https://huggingface.co/intfloat/"
274
+ target="_blank"
275
+ class="underline hover:text-blue-500 hover:no-underline">
276
+ Liang Wang - e5 Models
277
+ </a>
278
+ </p>
279
+ </div>
280
+
281
+ <div>
282
+ <label for="model" class="font-medium block">Model Options: </label>
283
+ <select
284
+ id="model"
285
+ class="border-2 border-gray-500 rounded-md font-light interactive disabled:cursor-not-allowed w-full max-w-max">
286
+ <option value="gte_tiny">gte_tiny (45.5 MB)</option>
287
+ <option value="intfloat_e5_small_v2" selected>
288
+ intfloat/e5-small-v2 (133 MB)
289
+ </option>
290
+ <option value="intfloat_e5_base_v2">
291
+ intfloat/e5-base-v2 (438 MB)
292
+ </option>
293
+ <option value="intfloat_multilingual_e5_small">
294
+ intfloat/multilingual-e5-small (471 MB)
295
+ </option>
296
+ <option value="sentence_transformers_all_MiniLM_L6_v2">
297
+ sentence-transformers/all-MiniLM-L6-v2 (90.9 MB)
298
+ </option>
299
+ <option value="sentence_transformers_all_MiniLM_L12_v2">
300
+ sentence-transformers/all-MiniLM-L12-v2 (133 MB)
301
+ </option>
302
+ </select>
303
+ </div>
304
+ <div>
305
+ <h3 class="font-medium">Examples:</h3>
306
+ <form
307
+ id="form-wiki"
308
+ class="flex text-xs rounded-md justify-between w-min gap-3">
309
+ <input type="submit" hidden />
310
+
311
+ <button data-example class="disabled:cursor-not-allowed interactive">
312
+ Pizza
313
+ </button>
314
+ <button data-example class="disabled:cursor-not-allowed interactive">
315
+ Paris
316
+ </button>
317
+ <button data-example class="disabled:cursor-not-allowed interactive">
318
+ Physics
319
+ </button>
320
+ <input
321
+ type="text"
322
+ id="search-wiki"
323
+ title="Search Wikipedia article by title"
324
+ class="font-light py-0 mx-1 resize-none outline-none w-32 disabled:cursor-not-allowed interactive"
325
+ placeholder="Load Wikipedia article..." />
326
+ <button
327
+ title="Search Wikipedia article and load into input"
328
+ class="bg-gray-700 hover:bg-gray-800 text-white font-normal px-2 py-1 rounded disabled:bg-gray-300 disabled:cursor-not-allowed interactive">
329
+ Load
330
+ </button>
331
+ </form>
332
+ </div>
333
+ <form
334
+ id="form"
335
+ class="flex text-normal px-1 py-1 border border-gray-700 rounded-md items-center">
336
+ <input type="submit" hidden />
337
+ <input
338
+ type="text"
339
+ id="search-input"
340
+ class="font-light w-full px-3 py-2 mx-1 resize-none outline-none interactive disabled:cursor-not-allowed"
341
+ placeholder="Search query here..." />
342
+ <button
343
+ class="bg-gray-700 hover:bg-gray-800 text-white font-normal py-2 w-16 rounded disabled:bg-gray-300 disabled:cursor-not-allowed interactive">
344
+ Search
345
+ </button>
346
+ </form>
347
+ <div>
348
+ <h3 class="font-medium">Input text:</h3>
349
+ <div class="flex justify-between items-center">
350
+ <div class="rounded-md inline text-xs">
351
+ <span id="output-status" class="m-auto font-light invisible"
352
+ >C</span
353
+ >
354
+ </div>
355
+ </div>
356
+ <div
357
+ id="input-container"
358
+ tabindex="0"
359
+ class="min-h-[250px] bg-slate-100 text-gray-500 rounded-md p-4 flex flex-col gap-2 relative">
360
+ <textarea
361
+ id="input-area"
362
+ hidden
363
+ value=""
364
+ placeholder="Input text to perform semantic similarity search..."
365
+ class="flex-1 resize-none outline-none left-0 right-0 top-0 bottom-0 m-4 absolute interactive disabled:invisible"></textarea>
366
+ <p id="output-area" class="grid-rows-2">
367
+ Input text to perform semantic similarity search...
368
+ </p>
369
+ </div>
370
+ </div>
371
+ </main>
372
+ </body>
373
+ </html>
utils.js CHANGED
@@ -59,6 +59,11 @@ const MODELS = {
59
  search_prefix: "",
60
  document_prefix: "",
61
  },
 
 
 
 
 
62
  };
63
  export function getModelInfo(id) {
64
  return {
 
59
  search_prefix: "",
60
  document_prefix: "",
61
  },
62
+ gte_tiny: {
63
+ base_url: "https://huggingface.co/TaylorAI/gte-tiny/resolve/refs%2Fpr%2F2/",
64
+ search_prefix: "",
65
+ document_prefix: "",
66
+ },
67
  };
68
  export function getModelInfo(id) {
69
  return {