Germano Cavalcante committed on
Commit
5def575
·
1 Parent(s): 566fa33

Find Related: Optimize: Avoid creating embeddings for text that has not changed

Browse files
routers/tool_find_related.py CHANGED
@@ -12,9 +12,9 @@ from sentence_transformers import SentenceTransformer, util
12
  from fastapi import APIRouter
13
 
14
  try:
15
- from .utils_gitea import gitea_fetch_issues, gitea_json_issue_get
16
  except:
17
- from utils_gitea import gitea_fetch_issues, gitea_json_issue_get
18
 
19
 
20
  def _create_issue_string(title, body):
@@ -56,7 +56,8 @@ class EmbeddingContext:
56
  TOKEN_LEN_MAX_FOR_EMBEDDING = 512
57
  TOKEN_LEN_MAX_BALCKLIST = 2 * TOKEN_LEN_MAX_FOR_EMBEDDING
58
  ARRAY_CHUNK_SIZE = 4096
59
- issue_attr_filter = {'number', 'title', 'body', 'state', 'updated_at'}
 
60
  cache_path = "routers/tool_find_related_cache.pkl"
61
 
62
  # Set when creating the object
@@ -278,24 +279,36 @@ class EmbeddingContext:
278
  data['updated_at'] = date_new
279
 
280
  # autopep8: off
281
- # WORKAROUND:
282
  # Consider that if the time hasn't changed, it's the same issue.
283
  issues = [issue for issue in issues if issue['updated_at'] != date_old]
284
 
285
  self.data_ensure_size(repo, int(issues[0]['number']))
286
 
287
- texts_to_embed = self.create_strings_to_embbed(issues, black_list)
288
- embeddings = self.encode(texts_to_embed)
289
 
290
  for i, issue in enumerate(issues):
291
  number = int(issue['number'])
292
- data['titles'][number] = issue['title']
293
- data['embeddings'][number] = embeddings[i]
294
  if issue['state'] == 'open':
295
  data['opened'][number] = True
296
  if issue['state'] == 'closed':
297
  data['closed'][number] = True
298
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  # autopep8: on
300
  return data
301
 
 
12
  from fastapi import APIRouter
13
 
14
  try:
15
+ from .utils_gitea import gitea_fetch_issues, gitea_json_issue_get, gitea_issues_body_updated_at_get
16
  except:
17
+ from utils_gitea import gitea_fetch_issues, gitea_json_issue_get, gitea_issues_body_updated_at_get
18
 
19
 
20
  def _create_issue_string(title, body):
 
56
  TOKEN_LEN_MAX_FOR_EMBEDDING = 512
57
  TOKEN_LEN_MAX_BALCKLIST = 2 * TOKEN_LEN_MAX_FOR_EMBEDDING
58
  ARRAY_CHUNK_SIZE = 4096
59
+ issue_attr_filter = {'number', 'title', 'body',
60
+ 'state', 'updated_at', 'created_at'}
61
  cache_path = "routers/tool_find_related_cache.pkl"
62
 
63
  # Set when creating the object
 
279
  data['updated_at'] = date_new
280
 
281
  # autopep8: off
 
282
  # Consider that if the time hasn't changed, it's the same issue.
283
  issues = [issue for issue in issues if issue['updated_at'] != date_old]
284
 
285
  self.data_ensure_size(repo, int(issues[0]['number']))
286
 
287
+ updated_at = gitea_issues_body_updated_at_get(issues)
288
+ issues_to_embed = []
289
 
290
  for i, issue in enumerate(issues):
291
  number = int(issue['number'])
 
 
292
  if issue['state'] == 'open':
293
  data['opened'][number] = True
294
  if issue['state'] == 'closed':
295
  data['closed'][number] = True
296
 
297
+ title_old = data['titles'][number]
298
+ if title_old != issue['title']:
299
+ data['titles'][number] = issue['title']
300
+ issues_to_embed.append(issue)
301
+ elif updated_at[i] >= date_old:
302
+ issues_to_embed.append(issue)
303
+
304
+ if issues_to_embed:
305
+ texts_to_embed = self.create_strings_to_embbed(issues_to_embed, black_list)
306
+ embeddings = self.encode(texts_to_embed)
307
+
308
+ for i, issue in enumerate(issues_to_embed):
309
+ number = int(issue['number'])
310
+ data['embeddings'][number] = embeddings[i]
311
+
312
  # autopep8: on
313
  return data
314
 
routers/tool_find_related_cache.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:178cabd52e35e69184b0e49a0bdae18478e99d1b5cec6f590840e7d7c65576d8
3
- size 723396066
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a181cc69d535d6502588e4c14bea367d74dfaca17a5602a23a72def479f592cc
3
+ size 723433353
routers/utils_gitea.py CHANGED
@@ -87,3 +87,24 @@ def gitea_fetch_issues(owner, repo, state='all', labels='', issue_attr_filter=No
87
  encoded_query_params = urllib.parse.urlencode(query_params)
88
  issues_url = f"{base_url}?{encoded_query_params}"
89
  return url_json_get_all_pages(issues_url, item_filter=issue_attr_filter, exclude=exclude, verbose=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  encoded_query_params = urllib.parse.urlencode(query_params)
88
  issues_url = f"{base_url}?{encoded_query_params}"
89
  return url_json_get_all_pages(issues_url, item_filter=issue_attr_filter, exclude=exclude, verbose=True)
90
+
91
+
92
def gitea_issues_body_updated_at_get(issues, verbose=True):
    """Return one body-edit timestamp string per issue, in input order.

    For each issue, the ``content-history/list`` page of the
    ``blender/blender`` repository on projects.blender.org is requested and
    the value of the ``datetime="..."`` attribute found in the ``name`` field
    of the first returned entry is extracted.
    NOTE(review): presumably the first entry is the most recent edit of the
    issue body -- confirm against the server's response ordering.

    When the response is empty, has no ``results``, or the first entry does
    not carry a ``datetime="`` attribute, the issue's own ``created_at``
    value is used instead.

    :param issues: sequence of issue dicts; each must provide ``'number'``
        and ``'created_at'``.
    :param verbose: when true, print one progress line per issue.
    :return: list of timestamp strings where element ``i`` belongs to
        ``issues[i]``.
    """
    def fetch_issue(issue):
        number = issue['number']
        if verbose:
            print(f"Fetched issue #{number}")

        json_data = url_json_get(
            f"https://projects.blender.org/blender/blender/issues/{number}/content-history/list")

        # Verify that the response contains the expected data before
        # trying to access it.
        results = json_data.get('results') if json_data else None
        if results:
            _, marker, tail = results[0]['name'].partition('datetime="')
            if marker:
                return tail.split('"', 1)[0]

        return issue['created_at']

    # The requests run concurrently, but the results must line up with
    # `issues` because callers index the returned list by issue position.
    # `executor.map` yields results in submission order, whereas
    # `as_completed` would yield them in completion order.
    with ThreadPoolExecutor() as executor:
        return list(executor.map(fetch_issue, issues))