mikesoylu committed on
Commit d1afd99
1 Parent(s): ef4c9c2

Update app.py

Files changed (1)
  1. app.py +26 -2
app.py CHANGED
@@ -13,8 +13,32 @@ class NumpyEncoder(json.JSONEncoder):
         return json.JSONEncoder.default(self, obj)
 
 def text_to_embedding(text):
-    embedding = model.encode(text)
-    return json.dumps(embedding, cls=NumpyEncoder)
+    # Tokenize the input text
+    tokens = model.tokenize(text)
+
+    # Check if the token count exceeds the model's maximum sequence length
+    if len(tokens) > model.max_seq_length:
+
+        # Split the input text into chunks
+        chunks = []
+        for i in range(0, len(tokens), model.max_seq_length):
+            chunk = tokens[i:i + model.max_seq_length]
+            chunks.append(model.tokenizer.convert_tokens_to_string(chunk))
+
+        # Encode each chunk and store the embeddings
+        embeddings = []
+        for chunk in chunks:
+            embedding = model.encode(chunk)
+            embeddings.append(embedding)
+
+        # Calculate the average embedding
+        avg_embedding = np.mean(embeddings, axis=0)
+
+    else:
+        # If the token count is within the limit, just encode the input text
+        avg_embedding = model.encode(text)
+
+    return json.dumps(avg_embedding, cls=NumpyEncoder)
 
 inputs = gr.inputs.Textbox(default="Type text here.")
 outputs = gr.outputs.Textbox()
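
For reference, the chunk-and-average approach added in this commit can be written as a standalone function. This is a minimal sketch, assuming model is a sentence-transformers SentenceTransformer; the model name, the function name embed_long_text, and the use of the underlying Hugging Face tokenizer (model.tokenizer.tokenize) are illustrative assumptions, not taken from app.py.

import numpy as np
from sentence_transformers import SentenceTransformer

# Assumed model name, for illustration only.
model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_long_text(text):
    # Word-piece tokenize with the model's own tokenizer to measure length.
    tokens = model.tokenizer.tokenize(text)
    max_len = model.max_seq_length

    # Short inputs fit in a single pass.
    if len(tokens) <= max_len:
        return model.encode(text)

    # Split the token sequence into max_seq_length-sized windows,
    # turn each window back into a string, and encode them in one batch.
    chunks = [
        model.tokenizer.convert_tokens_to_string(tokens[i:i + max_len])
        for i in range(0, len(tokens), max_len)
    ]
    embeddings = model.encode(chunks)  # shape: (num_chunks, embedding_dim)

    # Average the per-chunk embeddings into a single vector.
    return np.mean(embeddings, axis=0)

Averaging the per-chunk vectors keeps the output dimensionality fixed regardless of input length, at the cost of blurring chunk-level detail.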