Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -13,8 +13,32 @@ class NumpyEncoder(json.JSONEncoder):
|
|
13 |
return json.JSONEncoder.default(self, obj)
|
14 |
|
15 |
def text_to_embedding(text):
|
16 |
-
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
inputs = gr.inputs.Textbox(default="Type text here.")
|
20 |
outputs = gr.outputs.Textbox()
|
|
|
13 |
return json.JSONEncoder.default(self, obj)
|
14 |
|
15 |
def text_to_embedding(text):
    """Embed *text* with the sentence-transformer model; return JSON.

    Inputs whose token count fits within ``model.max_seq_length`` are
    encoded in a single pass.  Longer inputs are split into token
    windows of at most ``max_seq_length`` tokens, each window is
    re-assembled into a string and encoded separately, and the per-chunk
    embeddings are mean-pooled into one vector.

    Args:
        text: Input string to embed.

    Returns:
        JSON string containing the embedding vector, serialized with
        ``NumpyEncoder`` (which handles numpy arrays/scalars).
    """
    tokens = model.tokenize(text)

    if len(tokens) > model.max_seq_length:
        # Re-assemble each token window back into a plain string so it
        # can go through the normal encode() path.
        # NOTE(review): the windows are cut purely by token count and do
        # not reserve room for special tokens that encode() may re-add,
        # so a chunk can slightly exceed the limit after re-tokenization
        # and get truncated — confirm against the model's tokenizer.
        chunks = [
            model.tokenizer.convert_tokens_to_string(
                tokens[i:i + model.max_seq_length]
            )
            for i in range(0, len(tokens), model.max_seq_length)
        ]

        # Mean-pool the per-chunk embeddings into a single vector.
        embeddings = [model.encode(chunk) for chunk in chunks]
        avg_embedding = np.mean(embeddings, axis=0)
    else:
        # Short enough: encode the whole input in one pass.
        avg_embedding = model.encode(text)

    return json.dumps(avg_embedding, cls=NumpyEncoder)
|
42 |
|
43 |
# Gradio I/O components wiring the text-embedding function to the UI.
# NOTE(review): gr.inputs.Textbox / gr.outputs.Textbox are the legacy
# namespaced Gradio API (removed in Gradio 3.x+, which uses gr.Textbox
# for both) — confirm the pinned gradio version before upgrading.
inputs = gr.inputs.Textbox(default="Type text here.")
outputs = gr.outputs.Textbox()
|