Spaces · Running

Germano Cavalcante committed · commit 8aab0fd · 1 parent: c7f8eb7

Fix Openai API for embeddings
Files changed:
- config.py (+3 -0)
- requirements-fastapi.txt (+2 -1)
- routers/tool_find_related.py (+28 -14)
config.py CHANGED

```diff
@@ -4,9 +4,12 @@ import os
 
 class Settings(BaseSettings):
     huggingface_key: str = os.environ.get("huggingface_key")
+    OPENAI_API_KEY: str = os.environ.get("OPENAI_API_KEY")
     cache_dir: str = "cache"
     embedding_api: str = "sbert"
     embedding_model: str = "mano-wii/BAAI_bge-base-en-v1.5-tunned-for-blender-issues"
+    # embedding_api: str = "openai"
+    # embedding_model: str = "text-embedding-ada-002"
 
 
 settings = Settings()
```
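The commented-out pair shows the intended way of switching the Space to the OpenAI backend. As a rough illustration of how these settings could be consumed (the `make_encoder` helper below is hypothetical, not part of the commit):

```python
# Hypothetical sketch of how the new settings drive backend selection.
from config import settings

def make_encoder():
    if settings.embedding_api == "openai":
        from openai import OpenAI
        # Uses the key added in this commit; assumes openai>=1.0.
        return OpenAI(api_key=settings.OPENAI_API_KEY)
    # Default path: the fine-tuned SBERT model named in embedding_model.
    from sentence_transformers import SentenceTransformer
    return SentenceTransformer(settings.embedding_model)
```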
requirements-fastapi.txt CHANGED

```diff
@@ -3,4 +3,5 @@ uvicorn[standard]
 python-multipart
 pydantic-settings
 huggingface_hub
-sentence_transformers
+sentence_transformers
+openai
```
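Note that the client-object API used in the next file (`OpenAI(...)`, `client.embeddings.create(...)`) only exists in openai 1.0 and later, so the unpinned `openai` entry works on a fresh install, but an environment that already pins an older release would need `openai>=1.0`.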
routers/tool_find_related.py CHANGED

```diff
@@ -62,6 +62,7 @@ class EmbeddingContext:
     # Set when creating the object
     lock = None
     model = None
+    openai_client = None
     model_name = ''
     config_type = ''
 
@@ -86,7 +87,11 @@ class EmbeddingContext:
             self.model = self.model.to('cuda')
 
         elif config_type == 'openai':
-
+            from openai import OpenAI
+            self.openai_client = OpenAI(
+                # base_url = settings.openai_api_base
+                api_key=settings.OPENAI_API_KEY,
+            )
             self.encode = self.encode_openai
 
         self.model_name = model_name
@@ -100,7 +105,6 @@ class EmbeddingContext:
 
     def encode_openai(self, texts_to_embed):
         import math
-        import openai
         import time
 
         tokens_count = 0
@@ -116,8 +120,11 @@ class EmbeddingContext:
             end = start + chunk_size
             chunk = texts_to_embed[start:end]
 
-            embeddings_tmp = openai.Embedding.create(
-                model=self.model_name, input=chunk)['data']
+            embeddings_tmp = self.openai_client.embeddings.create(
+                model=self.model_name,
+                input=chunk,
+            ).data
+
             if embeddings_tmp is None:
                 break
 
@@ -126,7 +133,7 @@ class EmbeddingContext:
             if i < chunks_num - 1:
                 time.sleep(60)  # Wait 1 minute before the next call
 
-        return torch.stack([torch.tensor(embedding['embedding'], dtype=torch.float32) for embedding in embeddings])
+        return torch.stack([torch.tensor(embedding.embedding, dtype=torch.float32) for embedding in embeddings])
 
     def get_tokens(self, text):
         if self.model:
@@ -204,18 +211,20 @@ class EmbeddingContext:
         issues = gitea_fetch_issues(
             owner, repo, since=date_old, issue_attr_filter=self.issue_attr_filter, exclude=black_list)
 
-        # WORKAROUND:
-        # Consider that if the time hasn't changed, it's the same issue.
-        issues = [
-            issue for issue in issues if issue['updated_at'] != date_old]
+        # Get the most recent date
+        date_new = _find_latest_date(issues, date_old)
 
-        if not issues:
+        if date_new == date_old:
+            # Nothing changed
             return data
 
-        # Get the most recent date
-        date_new = _find_latest_date(issues, date_old)
+        data['updated_at'] = date_new
 
         # autopep8: off
+        # WORKAROUND:
+        # Consider that if the time hasn't changed, it's the same issue.
+        issues = [issue for issue in issues if issue['updated_at'] != date_old]
+
         numbers_old = data['numbers']
         titles_old = data['titles']
         embeddings_old = data['embeddings']
@@ -239,6 +248,9 @@ class EmbeddingContext:
                     old_closed.append(i_old)
                     break
 
+        if not old_closed and not issues_open:
+            return data
+
         mask_open = torch.ones(len(numbers_open), dtype=torch.bool)
         need_sort = False
         change_map = []
@@ -286,10 +298,13 @@ class EmbeddingContext:
             i_new += 1
 
             assert i_new == total
-        else:
+        elif mask_open.any():
             titles_new = titles_old + [issue['title'] for i, issue in enumerate(issues_open) if mask_open[i]]
             numbers_new = numbers_old + [number for i, number in enumerate(numbers_open) if mask_open[i]]
             embeddings_new = torch.cat([embeddings_old, embeddings[mask_open]])
+        else:
+            # Only Updated Data changed
+            return data
 
         if need_sort:
             sorted_indices = sorted(range(len(numbers_new)), key=lambda k: numbers_new[k])
@@ -297,7 +312,6 @@ class EmbeddingContext:
             numbers_new = [numbers_new[i] for i in sorted_indices]
             embeddings_new = embeddings_new[sorted_indices]
 
-        data['updated_at'] = date_new
         data['titles'] = titles_new
         data['numbers'] = numbers_new
         data['embeddings'] = embeddings_new
```
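The substance of the fix is the migration from the pre-1.0 module-level API (`openai.Embedding.create`) to the 1.x client object, which also changes the result from dict-style to attribute access. A self-contained sketch of the new call pattern, with a placeholder key and inputs:

```python
# Standalone sketch of the openai>=1.0 embedding pattern used above.
import torch
from openai import OpenAI

client = OpenAI(api_key="sk-...")  # placeholder key

chunk = ["first issue title", "second issue title"]
response = client.embeddings.create(
    model="text-embedding-ada-002",
    input=chunk,
)
# response.data is a list of Embedding objects; the vector is the
# .embedding attribute (openai<1.0 returned dicts indexed by 'embedding').
stacked = torch.stack(
    [torch.tensor(item.embedding, dtype=torch.float32) for item in response.data]
)
print(stacked.shape)  # torch.Size([2, 1536]) for text-embedding-ada-002
```

In `encode_openai` itself this call sits inside a loop that splits the input into token-budgeted chunks and sleeps 60 seconds between chunks to stay under the API rate limit.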
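Besides the API change, the diff reorders `_embeddings_updated_get` so the cheap date comparison happens before any filtering, and the function can now bail out early at three points. A condensed, approximate sketch of the resulting control flow (the fetch and date helpers are stand-ins for `gitea_fetch_issues` and `_find_latest_date`):

```python
# Condensed control flow of the update path after this commit.
def update_cached_embeddings(data, fetch_issues, find_latest_date):
    date_old = data['updated_at']
    issues = fetch_issues(since=date_old)

    # Exit 1: no issue moved past the cached timestamp.
    date_new = find_latest_date(issues, date_old)
    if date_new == date_old:
        return data

    # The timestamp is now recorded up front rather than at the very end.
    data['updated_at'] = date_new

    # WORKAROUND: an issue whose timestamp equals date_old is treated as
    # already processed and dropped from the batch.
    issues = [issue for issue in issues if issue['updated_at'] != date_old]

    # Exits 2 and 3 in the real code cover "nothing to close or add" and
    # "only the updated_at field changed"; see the diff above.
    ...
    return data
```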