Germano Cavalcante committed on
Commit 8aab0fd
1 Parent(s): c7f8eb7

Fix OpenAI API for embeddings

config.py CHANGED
@@ -4,9 +4,12 @@ import os
 
 class Settings(BaseSettings):
     huggingface_key: str = os.environ.get("huggingface_key")
+    OPENAI_API_KEY: str = os.environ.get("OPENAI_API_KEY")
     cache_dir: str = "cache"
     embedding_api: str = "sbert"
     embedding_model: str = "mano-wii/BAAI_bge-base-en-v1.5-tunned-for-blender-issues"
+    # embedding_api: str = "openai"
+    # embedding_model: str = "text-embedding-ada-002"
 
 
 settings = Settings()
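
A side note on the new setting (an observation, not part of the commit): because the default is evaluated with os.environ.get at import time, OPENAI_API_KEY must exist in the environment before this module is imported, otherwise pydantic rejects None as the value of a non-optional str field. A minimal sketch, assuming the module above is importable as config:

import os

# Assumption for illustration: the key is set before config is imported;
# otherwise os.environ.get("OPENAI_API_KEY") yields None and validation fails.
os.environ.setdefault("OPENAI_API_KEY", "sk-placeholder")

from config import settings

# Switching backends means flipping the two commented defaults above:
#   embedding_api = "openai", embedding_model = "text-embedding-ada-002"
print(settings.embedding_api, settings.OPENAI_API_KEY is not None)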
requirements-fastapi.txt CHANGED
@@ -3,4 +3,5 @@ uvicorn[standard]
 python-multipart
 pydantic-settings
 huggingface_hub
-sentence_transformers
+sentence_transformers
+openai
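
The new openai dependency is left unpinned, but the client code in this commit uses the v1 interface (from openai import OpenAI), which pre-1.0 releases do not expose. A quick sanity check, as a hedged sketch:

# Verifies the installed package exposes the v1 client this commit relies on.
import openai
from openai import OpenAI  # raises ImportError on openai < 1.0

print(openai.__version__)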
routers/tool_find_related.py CHANGED
@@ -62,6 +62,7 @@ class EmbeddingContext:
     # Set when creating the object
     lock = None
     model = None
+    openai_client = None
     model_name = ''
     config_type = ''
 
@@ -86,7 +87,11 @@ class EmbeddingContext:
             self.model = self.model.to('cuda')
 
         elif config_type == 'openai':
-            # openai.api_base = settings.openai.api_base
+            from openai import OpenAI
+            self.openai_client = OpenAI(
+                # base_url = settings.openai_api_base
+                api_key=settings.OPENAI_API_KEY,
+            )
             self.encode = self.encode_openai
 
         self.model_name = model_name
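
For reference, the v1 client also reads OPENAI_API_KEY from the environment on its own when api_key is omitted, so passing settings.OPENAI_API_KEY explicitly mainly keeps configuration in one place. The commented-out base_url would let the client target an OpenAI-compatible endpoint. A minimal sketch (the URL is a placeholder, not from this repo):

from openai import OpenAI

client = OpenAI()  # picks up OPENAI_API_KEY from the environment itself

# Hypothetical alternative endpoint, mirroring the commented base_url line:
client_alt = OpenAI(
    api_key="sk-placeholder",
    base_url="https://api.example.com/v1",
)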
@@ -100,7 +105,6 @@ class EmbeddingContext:
 
     def encode_openai(self, texts_to_embed):
         import math
-        import openai
         import time
 
         tokens_count = 0
@@ -116,8 +120,11 @@ class EmbeddingContext:
             end = start + chunk_size
             chunk = texts_to_embed[start:end]
 
-            embeddings_tmp = openai.Embedding.create(
-                input=chunk, model=self.model_name)['data']
+            embeddings_tmp = self.openai_client.embeddings.create(
+                model=self.model_name,
+                input=chunk,
+            ).data
+
             if embeddings_tmp is None:
                 break
 
@@ -126,7 +133,7 @@ class EmbeddingContext:
             if i < chunks_num - 1:
                 time.sleep(60)  # Wait 1 minute before the next call
 
-        return torch.stack([torch.tensor(embedding['embedding'], dtype=torch.float32) for embedding in embeddings])
+        return torch.stack([torch.tensor(embedding.embedding, dtype=torch.float32) for embedding in embeddings])
 
     def get_tokens(self, text):
         if self.model:
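
The switch from embedding['embedding'] to embedding.embedding follows from the v1 SDK returning typed objects instead of dicts: embeddings.create yields a response whose .data is a list of items, each carrying .embedding as a plain list of floats. A minimal sketch of the shape the new code consumes:

import torch
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set

resp = client.embeddings.create(
    model="text-embedding-ada-002",
    input=["first text", "second text"],
)

# .data items are objects, hence attribute access rather than subscripting.
vectors = torch.stack(
    [torch.tensor(item.embedding, dtype=torch.float32) for item in resp.data]
)
print(vectors.shape)            # torch.Size([2, 1536]) for ada-002
print(resp.usage.total_tokens)  # token usage also comes back as a typed field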
@@ -204,18 +211,20 @@ class EmbeddingContext:
         issues = gitea_fetch_issues(
             owner, repo, since=date_old, issue_attr_filter=self.issue_attr_filter, exclude=black_list)
 
-        # WORKAROUND:
-        # Consider that if the time hasn't changed, it's the same issue.
-        issues = [
-            issue for issue in issues if issue['updated_at'] != date_old]
+        # Get the most recent date
+        date_new = _find_latest_date(issues, date_old)
 
-        if len(issues) == 0:
+        if date_new == date_old:
+            # Nothing changed
             return data
 
-        # Get the most recent date
-        date_new = _find_latest_date(issues, date_old)
+        data['updated_at'] = date_new
 
         # autopep8: off
+        # WORKAROUND:
+        # Consider that if the time hasn't changed, it's the same issue.
+        issues = [issue for issue in issues if issue['updated_at'] != date_old]
+
         numbers_old = data['numbers']
         titles_old = data['titles']
         embeddings_old = data['embeddings']
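
The reordering above computes date_new first so the function can bail out before any filtering when nothing changed. _find_latest_date itself is not part of this diff; a plausible reading based on how it is used here (hypothetical sketch, not the repo's code):

def _find_latest_date(issues, default):
    # Hypothetical: Gitea's updated_at values are ISO-8601 strings, which
    # compare correctly as strings when the timezone/format is uniform.
    return max((issue['updated_at'] for issue in issues), default=default)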
@@ -239,6 +248,9 @@ class EmbeddingContext:
                     old_closed.append(i_old)
                     break
 
+        if not old_closed and not issues_open:
+            return data
+
         mask_open = torch.ones(len(numbers_open), dtype=torch.bool)
         need_sort = False
         change_map = []
@@ -286,10 +298,13 @@ class EmbeddingContext:
                 i_new += 1
 
             assert i_new == total
-        else:
+        elif mask_open.any():
             titles_new = titles_old + [issue['title'] for i, issue in enumerate(issues_open) if mask_open[i]]
             numbers_new = numbers_old + [number for i, number in enumerate(numbers_open) if mask_open[i]]
             embeddings_new = torch.cat([embeddings_old, embeddings[mask_open]])
+        else:
+            # Only Updated Data changed
+            return data
 
         if need_sort:
             sorted_indices = sorted(range(len(numbers_new)), key=lambda k: numbers_new[k])
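
The new elif/else split relies on mask_open to tell genuinely new open issues apart from in-place updates to already-cached ones; only masked-in rows get appended. The indexing semantics, as a self-contained sketch:

import torch

embeddings_old = torch.zeros((3, 4))   # cached vectors
embeddings = torch.ones((2, 4))        # vectors for freshly fetched issues
mask_open = torch.tensor([True, False])

# Boolean indexing keeps only rows whose mask entry is True, so an issue
# that merely changed in place is not appended a second time.
combined = torch.cat([embeddings_old, embeddings[mask_open]])
print(combined.shape)  # torch.Size([4, 4])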
@@ -297,7 +312,6 @@ class EmbeddingContext:
             numbers_new = [numbers_new[i] for i in sorted_indices]
             embeddings_new = embeddings_new[sorted_indices]
 
-        data['updated_at'] = date_new
         data['titles'] = titles_new
         data['numbers'] = numbers_new
         data['embeddings'] = embeddings_new