Spaces:

mano-wii
/

tools

Running

App Files Files

Germano Cavalcante committed on Jul 20, 2024

Commit

ed15883

1 Parent(s): 5def575

New tool Wiki Search

Browse files

Searches the manual, or any other embedded information, in order to find
context for the information being requested.

Files changed (8) hide show

main.py +4 -1
routers/embedding/__init__.py +114 -0
routers/{tool_find_related_cache.pkl → embedding/embeddings_issues.pkl} +2 -2
routers/embedding/embeddings_manual.pkl +3 -0
routers/tool_calls.py +4 -0
routers/tool_find_related.py +143 -262
routers/tool_wiki_search.py +295 -0
utils/generate_blender_doc.py +0 -194

main.py CHANGED Viewed

@@ -6,7 +6,7 @@ from fastapi.responses import HTMLResponse
 from fastapi.staticfiles import StaticFiles
 from huggingface_hub import login
 from config import settings
-from routers import tool_bpy_doc, tool_gpu_checker, tool_calls, tool_find_related
 login(settings.huggingface_key)
@@ -30,6 +30,9 @@ app.include_router(
 app.include_router(
     tool_find_related.router, prefix="/api/v1", tags=["Tools"])
 app.include_router(
     tool_calls.router, prefix="/api/v1", tags=["Function Calls"])

 from fastapi.staticfiles import StaticFiles
 from huggingface_hub import login
 from config import settings
+from routers import tool_bpy_doc, tool_gpu_checker, tool_calls, tool_find_related, tool_wiki_search
 login(settings.huggingface_key)
 app.include_router(
     tool_find_related.router, prefix="/api/v1", tags=["Tools"])
+app.include_router(
+    tool_wiki_search.router, prefix="/api/v1", tags=["Tools"])
 app.include_router(
     tool_calls.router, prefix="/api/v1", tags=["Function Calls"])

routers/embedding/__init__.py ADDED Viewed

	@@ -0,0 +1,114 @@

+# routers/embedding/__init__.py
+import os
+import sys
+import threading
+import torch
+from sentence_transformers import SentenceTransformer, util
+class EmbeddingContext:
+    """Text-embedding backend selected by ``settings.embedding_api``.
+
+    ``'sbert'`` loads a ``SentenceTransformer`` model (moved to CUDA when
+    available); ``'openai'`` creates an OpenAI client. ``encode`` is rebound
+    on the instance to the matching implementation, and a probe string is
+    encoded once to record the shape, dtype and device of the vectors.
+    """
+    # These don't change
+    TOKEN_LEN_MAX_FOR_EMBEDDING = 512
+    # Token budget per OpenAI request used to split a batch into chunks.
+    OPENAI_TOKENS_PER_REQUEST = 500000
+    # Set when creating the object
+    lock = None
+    model = None
+    openai_client = None
+    model_name = ''
+    config_type = ''
+    embedding_shape = None
+    embedding_dtype = None
+    embedding_device = None
+    # Updates constantly
+    data = {}
+    def __init__(self):
+        """Create the backend described by ``config.settings``.
+
+        Raises:
+            ValueError: if ``settings.embedding_api`` is neither ``'sbert'``
+                nor ``'openai'``.
+        """
+        try:
+            from config import settings
+        except ImportError:
+            # Not importable from the current path: add the directory two
+            # levels above this file and retry.
+            sys.path.append(os.path.abspath(
+                os.path.join(os.path.dirname(__file__), '../..')))
+            from config import settings
+        self.lock = threading.Lock()
+        config_type = settings.embedding_api
+        model_name = settings.embedding_model
+        if config_type == 'sbert':
+            self.model = SentenceTransformer(model_name, use_auth_token=False)
+            self.model.max_seq_length = self.TOKEN_LEN_MAX_FOR_EMBEDDING
+            print("Max Sequence Length:", self.model.max_seq_length)
+            self.encode = self.encode_sbert
+            if torch.cuda.is_available():
+                self.model = self.model.to('cuda')
+        elif config_type == 'openai':
+            from openai import OpenAI
+            self.openai_client = OpenAI(
+                # base_url = settings.openai_api_base
+                api_key=settings.OPENAI_API_KEY,
+            )
+            self.encode = self.encode_openai
+        else:
+            # Without this the probe below would fail with an unrelated
+            # AttributeError on the placeholder encode() result.
+            raise ValueError(
+                f"Unsupported embedding_api {config_type!r}; expected 'sbert' or 'openai'")
+        self.model_name = model_name
+        self.config_type = config_type
+        # Probe once to learn what the backend returns.
+        tmp = self.encode(['tmp'])
+        self.embedding_shape = tmp.shape[1:]
+        self.embedding_dtype = tmp.dtype
+        self.embedding_device = tmp.device
+    def encode(self, texts_to_embed):
+        """Placeholder; ``__init__`` rebinds it to the backend-specific encoder."""
+        pass
+    def encode_sbert(self, texts_to_embed):
+        """Encode ``texts_to_embed`` with the local model into a normalized tensor."""
+        return self.model.encode(texts_to_embed, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
+    def encode_openai(self, texts_to_embed):
+        """Encode ``texts_to_embed`` through the OpenAI embeddings endpoint.
+
+        The batch is split so each request carries roughly at most
+        ``OPENAI_TOKENS_PER_REQUEST`` tokens, sleeping a minute between
+        requests. Returns a float32 tensor with one row per embedded text.
+        """
+        import math
+        import time
+        if not texts_to_embed:
+            # Nothing to send; avoid an API call with an empty input list.
+            return torch.empty((0, *(self.embedding_shape or ())), dtype=torch.float32)
+        # Count the tokens of the WHOLE batch before deciding how to split it.
+        tokens_count = sum(len(self.get_tokens(text)) for text in texts_to_embed)
+        # At least one chunk, so an all-empty batch does not divide by zero.
+        chunks_num = max(1, math.ceil(tokens_count / self.OPENAI_TOKENS_PER_REQUEST))
+        chunk_size = math.ceil(len(texts_to_embed) / chunks_num)
+        embeddings = []
+        for i in range(chunks_num):
+            start = i * chunk_size
+            chunk = texts_to_embed[start:start + chunk_size]
+            if not chunk:
+                # Rounding up chunk_size can leave trailing chunks empty.
+                break
+            embeddings_tmp = self.openai_client.embeddings.create(
+                model=self.model_name,
+                input=chunk,
+            ).data
+            if embeddings_tmp is None:
+                break
+            embeddings.extend(embeddings_tmp)
+            if i < chunks_num - 1:
+                time.sleep(60)  # Wait 1 minute before the next call
+        return torch.stack([torch.tensor(embedding.embedding, dtype=torch.float32) for embedding in embeddings])
+    def get_tokens(self, text):
+        """Tokenize ``text`` with the model tokenizer, or a regex split when no model is loaded."""
+        if self.model:
+            return self.model.tokenizer.tokenize(text)
+        # Imported here because the module header does not import ``re``.
+        import re
+        return [token for token in re.split(r'(\W|\b)', text) if token.strip()]
+# Module-level instance created at import time; imported by the routers as EMBEDDING_CTX.
+EMBEDDING_CTX = EmbeddingContext()

routers/{tool_find_related_cache.pkl → embedding/embeddings_issues.pkl} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a181cc69d535d6502588e4c14bea367d74dfaca17a5602a23a72def479f592cc
-size 723433353

 version https://git-lfs.github.com/spec/v1
+oid sha256:7c3c012a8f86440dacedd6f1e4e9ea9f41f096031c0ac1ed5cdf64a9a8d46e42
+size 723452942

routers/embedding/embeddings_manual.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ed7475fc8ffda0d9e9deb6480b7152b53657f0fe6a6140bcb60360e425e7a01
+size 18659241

routers/tool_calls.py CHANGED Viewed

@@ -8,10 +8,12 @@ try:
     from .tool_gpu_checker import gpu_checker_get_message
     from .tool_bpy_doc import bpy_doc_get_documentation
     from .tool_find_related import find_relatedness
 except:
     from tool_gpu_checker import gpu_checker_get_message
     from tool_bpy_doc import bpy_doc_get_documentation
     from tool_find_related import find_relatedness
 class ToolCallFunction(BaseModel):
@@ -43,6 +45,8 @@ def process_tool_call(tool_call: ToolCallInput) -> Dict:
         elif function_name == "find_related":
             output["output"] = find_relatedness(
                 function_args["repo"], function_args["number"])
     except json.JSONDecodeError as e:
         error_message = f"Malformed JSON encountered at position {e.pos}: {e.msg}\n {tool_call.function.arguments}"
         output["output"] = error_message

     from .tool_gpu_checker import gpu_checker_get_message
     from .tool_bpy_doc import bpy_doc_get_documentation
     from .tool_find_related import find_relatedness
+    from .tool_wiki_search import wiki_search
 except:
     from tool_gpu_checker import gpu_checker_get_message
     from tool_bpy_doc import bpy_doc_get_documentation
     from tool_find_related import find_relatedness
+    # Fallback branch: must be a plain import like its siblings above. A
+    # relative import here would fail exactly when this branch is reached.
+    from tool_wiki_search import wiki_search
 class ToolCallFunction(BaseModel):
         elif function_name == "find_related":
             output["output"] = find_relatedness(
                 function_args["repo"], function_args["number"])
+        elif function_name == "wiki_search":
+            output["output"] = wiki_search(function_args["query"])
     except json.JSONDecodeError as e:
         error_message = f"Malformed JSON encountered at position {e.pos}: {e.msg}\n {tool_call.function.arguments}"
         output["output"] = error_message

routers/tool_find_related.py CHANGED Viewed

@@ -1,22 +1,39 @@
-# find_related.py
 import os
 import pickle
-import re
 import torch
-import threading
 from datetime import datetime, timedelta
 from enum import Enum
-from sentence_transformers import SentenceTransformer, util
 from fastapi import APIRouter
 try:
     from .utils_gitea import gitea_fetch_issues, gitea_json_issue_get, gitea_issues_body_updated_at_get
 except:
     from utils_gitea import gitea_fetch_issues, gitea_json_issue_get, gitea_issues_body_updated_at_get
 def _create_issue_string(title, body):
     cleaned_body = body.replace('\r', '')
     cleaned_body = cleaned_body.replace('**System Information**\n', '')
@@ -51,283 +68,149 @@ def _find_latest_date(issues, default_str=None):
     return max((issue['updated_at'] for issue in issues), default=default_str)
-class EmbeddingContext:
-    # These don't change
-    TOKEN_LEN_MAX_FOR_EMBEDDING = 512
-    TOKEN_LEN_MAX_BALCKLIST = 2 * TOKEN_LEN_MAX_FOR_EMBEDDING
-    ARRAY_CHUNK_SIZE = 4096
-    issue_attr_filter = {'number', 'title', 'body',
-                         'state', 'updated_at', 'created_at'}
-    cache_path = "routers/tool_find_related_cache.pkl"
-    # Set when creating the object
-    lock = None
-    model = None
-    openai_client = None
-    model_name = ''
-    config_type = ''
-    embedding_shape = None
-    embedding_dtype = None
-    embedding_device = None
-    # Updates constantly
-    data = {}
-    black_list = {'blender': {109399, 113157, 114706},
-                  'blender-addons': set()}
-    def __init__(self):
-        self.lock = threading.Lock()
-        try:
-            from config import settings
-        except:
-            import sys
-            sys.path.append(os.path.abspath(
-                os.path.join(os.path.dirname(__file__), '..')))
-            from config import settings
-        config_type = settings.embedding_api
-        model_name = settings.embedding_model
-        if config_type == 'sbert':
-            self.model = SentenceTransformer(model_name, use_auth_token=False)
-            self.model.max_seq_length = self.TOKEN_LEN_MAX_FOR_EMBEDDING
-            print("Max Sequence Length:", self.model.max_seq_length)
-            self.encode = self.encode_sbert
-            if torch.cuda.is_available():
-                self.model = self.model.to('cuda')
-        elif config_type == 'openai':
-            from openai import OpenAI
-            self.openai_client = OpenAI(
-                # base_url = settings.openai_api_base
-                api_key=settings.OPENAI_API_KEY,
-            )
-            self.encode = self.encode_openai
-        self.model_name = model_name
-        self.config_type = config_type
-        tmp = self.encode(['tmp'])
-        self.embedding_shape = tmp.shape[1:]
-        self.embedding_dtype = tmp.dtype
-        self.embedding_device = tmp.device
-    def encode(self, texts_to_embed):
-        pass
-    def encode_sbert(self, texts_to_embed):
-        return self.model.encode(texts_to_embed, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
-    def encode_openai(self, texts_to_embed):
-        import math
-        import time
-        tokens_count = 0
-        for text in texts_to_embed:
-            tokens_count += len(self.get_tokens(text))
-            chunks_num = math.ceil(tokens_count / 500000)
-            chunk_size = math.ceil(len(texts_to_embed) / chunks_num)
-            embeddings = []
-            for i in range(chunks_num):
-                start = i * chunk_size
-                end = start + chunk_size
-                chunk = texts_to_embed[start:end]
-                embeddings_tmp = self.openai_client.embeddings.create(
-                    model=self.model_name,
-                    input=chunk,
-                ).data
-                if embeddings_tmp is None:
-                    break
-                embeddings.extend(embeddings_tmp)
-                if i < chunks_num - 1:
-                    time.sleep(60)  # Wait 1 minute before the next call
-            return torch.stack([torch.tensor(embedding.embedding, dtype=torch.float32) for embedding in embeddings])
-    def get_tokens(self, text):
-        if self.model:
-            return self.model.tokenizer.tokenize(text)
-        tokens = []
-        for token in re.split(r'(\W|\b)', text):
-            if token.strip():
-                tokens.append(token)
-        return tokens
-    def create_strings_to_embbed(self, issues, black_list):
-        texts_to_embed = [_create_issue_string(
-            issue['title'], issue['body']) for issue in issues]
-        # Create issue blacklist (for keepping track)
-        token_count = 0
-        for i, text in enumerate(texts_to_embed):
-            tokens = self.get_tokens(text)
-            tokens_len = len(tokens)
-            token_count += tokens_len
-            if tokens_len > self.TOKEN_LEN_MAX_BALCKLIST:
-                # Only use the first TOKEN_LEN_MAX tokens
-                black_list.add(int(issues[i]['number']))
-                if self.config_type == 'openai':
-                    texts_to_embed[i] = ' '.join(
-                        tokens[:self.TOKEN_LEN_MAX_BALCKLIST])
-        return texts_to_embed
-    def data_ensure_size(self, repo, size_new):
-        updated_at_old = None
-        arrays_size_old = 0
-        titles_old = []
-        try:
-            arrays_size_old = self.data[repo]['arrays_size']
-            if size_new <= arrays_size_old:
-                return
-        except:
-            pass
-        arrays_size_new = self.ARRAY_CHUNK_SIZE * \
-            (int(size_new / self.ARRAY_CHUNK_SIZE) + 1)
-        data_new = {
-            'updated_at': updated_at_old,
-            'arrays_size': arrays_size_new,
-            'titles': titles_old + [None] * (arrays_size_new - arrays_size_old),
-            'embeddings': torch.empty((arrays_size_new, *self.embedding_shape),
-                                      dtype=self.embedding_dtype,
-                                      device=self.embedding_device),
-            'opened':  torch.zeros(arrays_size_new, dtype=torch.bool),
-            'closed':  torch.zeros(arrays_size_new, dtype=torch.bool),
-        }
         try:
-            data_new['embeddings'][:arrays_size_old] = self.data[repo]['embeddings']
-            data_new['opened'][:arrays_size_old] = self.data[repo]['opened']
-            data_new['closed'][:arrays_size_old] = self.data[repo]['closed']
         except:
-            pass
-        self.data[repo] = data_new
-    def embeddings_generate(self, repo):
-        if os.path.exists(self.cache_path):
-            with open(self.cache_path, 'rb') as file:
-                self.data = pickle.load(file)
-                if repo in self.data:
-                    return
-        if not repo in self.black_list:
-            self.black_list[repo] = {}
-        black_list = self.black_list[repo]
-        issues = gitea_fetch_issues('blender', repo, state='all', since=None,
-                                    issue_attr_filter=self.issue_attr_filter, exclude=black_list)
-        # issues = sorted(issues, key=lambda issue: int(issue['number']))
-        print("Embedding Issues...")
-        texts_to_embed = self.create_strings_to_embbed(issues, black_list)
-        embeddings = self.encode(texts_to_embed)
-        self.data_ensure_size(repo, int(issues[0]['number']))
-        self.data[repo]['updated_at'] = _find_latest_date(issues)
-        titles = self.data[repo]['titles']
-        embeddings_new = self.data[repo]['embeddings']
-        opened = self.data[repo]['opened']
-        closed = self.data[repo]['closed']
         for i, issue in enumerate(issues):
             number = int(issue['number'])
-            titles[number] = issue['title']
-            embeddings_new[number] = embeddings[i]
             if issue['state'] == 'open':
-                opened[number] = True
             if issue['state'] == 'closed':
-                closed[number] = True
-    def embeddings_updated_get(self, repo):
-        with self.lock:
-            try:
-                data = self.data[repo]
-            except:
-                self.embeddings_generate(repo)
-                data = self.data[repo]
-            black_list = self.black_list[repo]
-            date_old = data['updated_at']
-            issues = gitea_fetch_issues(
-                'blender', repo, since=date_old, issue_attr_filter=self.issue_attr_filter, exclude=black_list)
-            # Get the most recent date
-            date_new = _find_latest_date(issues, date_old)
-            if date_new == date_old:
-                # Nothing changed
-                return data
-            data['updated_at'] = date_new
-# autopep8: off
-            # Consider that if the time hasn't changed, it's the same issue.
-            issues = [issue for issue in issues if issue['updated_at'] != date_old]
-            self.data_ensure_size(repo, int(issues[0]['number']))
-            updated_at = gitea_issues_body_updated_at_get(issues)
-            issues_to_embed = []
-            for i, issue in enumerate(issues):
                 number = int(issue['number'])
-                if issue['state'] == 'open':
-                    data['opened'][number] = True
-                if issue['state'] == 'closed':
-                    data['closed'][number] = True
-                title_old = data['titles'][number]
-                if title_old != issue['title']:
-                    data['titles'][number] = issue['title']
-                    issues_to_embed.append(issue)
-                elif updated_at[i] >= date_old:
-                    issues_to_embed.append(issue)
-            if issues_to_embed:
-                texts_to_embed = self.create_strings_to_embbed(issues_to_embed, black_list)
-                embeddings = self.encode(texts_to_embed)
-                for i, issue in enumerate(issues_to_embed):
-                    number = int(issue['number'])
-                    data['embeddings'][number] = embeddings[i]
 # autopep8: on
-        return data
-router = APIRouter()
-EMBEDDING_CTX = EmbeddingContext()
-# EMBEDDING_CTX.embeddings_generate('blender', 'blender')
-# EMBEDDING_CTX.embeddings_generate('blender', 'blender-addons')
-# Define your Enum class
-class State(str, Enum):
-    opened = "opened"
-    closed = "closed"
-    all = "all"
 def _sort_similarity(data: dict,
-                     query_emb: torch.Tensor,
                      limit: int,
                      state: State = State.opened) -> list:
     duplicates = []
@@ -356,7 +239,7 @@ def _sort_similarity(data: dict,
 def find_relatedness(repo: str, number: int, limit: int = 20, state: State = State.opened):
-    data = EMBEDDING_CTX.embeddings_updated_get(repo)
     # Check if the embedding already exists.
     if data['titles'][number] is not None:
@@ -383,7 +266,7 @@ def find_relatedness(repo: str, number: int, limit: int = 20, state: State = Sta
 @router.get("/find_related/{repo}/{number}")
-def find_related(repo: str = 'blender', number: int = 104399, limit: int = 15, state: State = State.opened):
     related = find_relatedness(repo, number, limit=limit, state=state)
     return related
@@ -391,28 +274,26 @@ def find_related(repo: str = 'blender', number: int = 104399, limit: int = 15, s
 if __name__ == "__main__":
     update_cache = True
     if update_cache:
-        EMBEDDING_CTX.embeddings_updated_get('blender')
-        EMBEDDING_CTX.embeddings_updated_get('blender-addons')
-        cache_path = EMBEDDING_CTX.cache_path
-        with open(cache_path, "wb") as file:
             # Converting the embeddings to be CPU compatible, as the virtual machine in use currently only supports the CPU.
-            for val in EMBEDDING_CTX.data.values():
                 val['embeddings'] = val['embeddings'].to(torch.device('cpu'))
-            pickle.dump(EMBEDDING_CTX.data, file,
-                        protocol=pickle.HIGHEST_PROTOCOL)
-    else:
-        # Converting the embeddings to be GPU.
-        for val in EMBEDDING_CTX.data.values():
-            val['embeddings'] = val['embeddings'].to(torch.device('cuda'))
-        # 'blender/blender/111434' must print #96153, #83604 and #79762
-        related1 = find_relatedness(
-            'blender', 111434, limit=20, state=State.all)
-        related2 = find_relatedness('blender-addons', 104399, limit=20)
-        print("These are the 20 most related issues:")
-        print(related1)
-        print()
-        print("These are the 20 most related issues:")
-        print(related2)

+# routers/find_related.py
 import os
 import pickle
 import torch
+import re
+from typing import List
 from datetime import datetime, timedelta
 from enum import Enum
+from sentence_transformers import util
 from fastapi import APIRouter
 try:
+    from .embedding import EMBEDDING_CTX
     from .utils_gitea import gitea_fetch_issues, gitea_json_issue_get, gitea_issues_body_updated_at_get
 except:
+    from embedding import EMBEDDING_CTX
     from utils_gitea import gitea_fetch_issues, gitea_json_issue_get, gitea_issues_body_updated_at_get
+router = APIRouter()
+# Issue fields passed as issue_attr_filter to gitea_fetch_issues.
+issue_attr_filter = {'number', 'title', 'body',
+                     'state', 'updated_at', 'created_at'}
+# Pickle file read by _embeddings_generate and written by the __main__ block.
+G_cache_path = "routers/embedding/embeddings_issues.pkl"
+# Per-repository cache, keyed by repo name; entries are built by _data_ensure_size.
+G_data = {}
+class State(str, Enum):
+    # Issue-state filter taken by find_relatedness / find_related (default: opened).
+    opened = "opened"
+    closed = "closed"
+    all = "all"
 def _create_issue_string(title, body):
     cleaned_body = body.replace('\r', '')
     cleaned_body = cleaned_body.replace('**System Information**\n', '')
     return max((issue['updated_at'] for issue in issues), default=default_str)
+def _create_strings_to_embbed(issues):
+    """Return one string per issue, built from its title and body by _create_issue_string."""
+    return [_create_issue_string(entry['title'], entry['body'])
+            for entry in issues]
+def _data_ensure_size(repo, size_new):
+    """Ensure the arrays of ``G_data[repo]`` can be indexed by ``size_new``.
+
+    Callers pass an issue number and then index the arrays with it, so the
+    arrays must hold at least ``size_new + 1`` rows. When they are too small
+    (or the repo has no entry yet) a new entry is allocated, rounded up to a
+    multiple of ``ARRAY_CHUNK_SIZE``, and the existing titles, embeddings,
+    state flags and ``updated_at`` are carried over. ``G_data[repo]`` is
+    REPLACED by the new dict, so references taken earlier become stale.
+    """
+    global G_data
+    ARRAY_CHUNK_SIZE = 4096
+    data_old = G_data.get(repo)
+    if data_old is None:
+        arrays_size_old = 0
+        updated_at_old = None
+        titles_old = []
+    else:
+        arrays_size_old = data_old['arrays_size']
+        # Valid indices are 0..arrays_size_old-1, so size_new itself must fit.
+        if size_new < arrays_size_old:
+            return
+        # Keep what is already cached instead of resetting it on growth.
+        updated_at_old = data_old['updated_at']
+        titles_old = list(data_old['titles'][:arrays_size_old])
+    arrays_size_new = ARRAY_CHUNK_SIZE * (size_new // ARRAY_CHUNK_SIZE + 1)
+    data_new = {
+        'updated_at': updated_at_old,
+        'arrays_size': arrays_size_new,
+        'titles': titles_old + [None] * (arrays_size_new - len(titles_old)),
+        'embeddings': torch.empty((arrays_size_new, *EMBEDDING_CTX.embedding_shape),
+                                  dtype=EMBEDDING_CTX.embedding_dtype,
+                                  device=EMBEDDING_CTX.embedding_device),
+        'opened':  torch.zeros(arrays_size_new, dtype=torch.bool),
+        'closed':  torch.zeros(arrays_size_new, dtype=torch.bool),
+    }
+    if data_old is not None:
+        data_new['embeddings'][:arrays_size_old] = data_old['embeddings']
+        data_new['opened'][:arrays_size_old] = data_old['opened']
+        data_new['closed'][:arrays_size_old] = data_old['closed']
+    G_data[repo] = data_new
+def _embeddings_generate(repo):
+    """Fill ``G_data[repo]`` from the pickle cache, or by embedding every issue of the repo.
+
+    When the cache file exists it replaces ``G_data`` entirely; if it already
+    holds ``repo`` nothing else is done. Otherwise all issues are fetched
+    from Gitea, embedded, and stored at the index equal to their number.
+    """
+    global G_data
+    if os.path.exists(G_cache_path):
+        # NOTE(review): pickle.load runs arbitrary code; the cache file must be trusted.
+        with open(G_cache_path, 'rb') as file:
+            G_data = pickle.load(file)
+        if repo in G_data:
+            return
+    issues = gitea_fetch_issues('blender', repo, state='all', since=None,
+                                issue_attr_filter=issue_attr_filter)
+    print("Embedding Issues...")
+    texts_to_embed = _create_strings_to_embbed(issues)
+    # Skip the encoder entirely when the fetch returned nothing.
+    embeddings = EMBEDDING_CTX.encode(texts_to_embed) if texts_to_embed else []
+    # Size for the highest number instead of assuming the fetch order.
+    number_max = max((int(issue['number']) for issue in issues), default=0)
+    _data_ensure_size(repo, number_max)
+    data_repo = G_data[repo]
+    data_repo['updated_at'] = _find_latest_date(issues)
+    titles = data_repo['titles']
+    embeddings_new = data_repo['embeddings']
+    opened = data_repo['opened']
+    closed = data_repo['closed']
+    for i, issue in enumerate(issues):
+        number = int(issue['number'])
+        titles[number] = issue['title']
+        embeddings_new[number] = embeddings[i]
+        if issue['state'] == 'open':
+            opened[number] = True
+        if issue['state'] == 'closed':
+            closed[number] = True
+def _embeddings_updated_get(repo):
+    """Return ``G_data[repo]`` after syncing it with the issues changed since the last update.
+
+    Runs under ``EMBEDDING_CTX.lock``. Issues updated since the stored
+    ``updated_at`` are fetched; their open/closed flags and titles are
+    refreshed, and those whose title changed or whose body was updated are
+    re-embedded.
+    """
+    global G_data
+    with EMBEDDING_CTX.lock:
         try:
+            data_repo = G_data[repo]
+        except KeyError:
+            _embeddings_generate(repo)
+            data_repo = G_data[repo]
+        date_old = data_repo['updated_at']
+        issues = gitea_fetch_issues(
+            'blender', repo, since=date_old, issue_attr_filter=issue_attr_filter)
+        # Get the most recent date
+        date_new = _find_latest_date(issues, date_old)
+        if date_new == date_old:
+            # Nothing changed
+            return data_repo
+# autopep8: off
+        # Consider that if the time hasn't changed, it's the same issue.
+        issues = [issue for issue in issues if issue['updated_at'] != date_old]
+        # Size for the highest number instead of assuming the fetch order.
+        _data_ensure_size(repo, max(int(issue['number']) for issue in issues))
+        # _data_ensure_size may have replaced G_data[repo]; re-read it so the
+        # writes below (including updated_at) land on the live entry.
+        data_repo = G_data[repo]
+        data_repo['updated_at'] = date_new
+        updated_at = gitea_issues_body_updated_at_get(issues)
+        issues_to_embed = []
         for i, issue in enumerate(issues):
             number = int(issue['number'])
+            # Assign both flags so a state change does not leave the old flag set.
+            data_repo['opened'][number] = issue['state'] == 'open'
+            data_repo['closed'][number] = issue['state'] == 'closed'
+            title_old = data_repo['titles'][number]
+            if title_old != issue['title']:
+                data_repo['titles'][number] = issue['title']
+                issues_to_embed.append(issue)
+            elif date_old is None or updated_at[i] >= date_old:
+                issues_to_embed.append(issue)
+        if issues_to_embed:
+            print(f"Embedding {len(issues_to_embed)} issue{'s' if len(issues_to_embed) > 1 else ''}")
+            texts_to_embed = _create_strings_to_embbed(issues_to_embed)
+            embeddings = EMBEDDING_CTX.encode(texts_to_embed)
+            for i, issue in enumerate(issues_to_embed):
                 number = int(issue['number'])
+                data_repo['embeddings'][number] = embeddings[i]
 # autopep8: on
+    return data_repo
 def _sort_similarity(data: dict,
+                     query_emb: List[torch.Tensor],
                      limit: int,
                      state: State = State.opened) -> list:
     duplicates = []
 def find_relatedness(repo: str, number: int, limit: int = 20, state: State = State.opened):
+    data = _embeddings_updated_get(repo)
     # Check if the embedding already exists.
     if data['titles'][number] is not None:
 @router.get("/find_related/{repo}/{number}")
+def find_related(repo: str = 'blender', number: int = 104399, limit: int = 15, state: State = State.opened) -> str:
+    # Endpoint wrapper: hands the request parameters straight to find_relatedness.
+    return find_relatedness(repo, number, limit=limit, state=state)
 if __name__ == "__main__":
     update_cache = True
     if update_cache:
+        _embeddings_updated_get('blender')
+        _embeddings_updated_get('blender-addons')
+        with open(G_cache_path, "wb") as file:
             # Converting the embeddings to be CPU compatible, as the virtual machine in use currently only supports the CPU.
+            for val in G_data.values():
                 val['embeddings'] = val['embeddings'].to(torch.device('cpu'))
+            pickle.dump(G_data, file, protocol=pickle.HIGHEST_PROTOCOL)
+    # Move the embeddings to the GPU only when one exists; an unconditional
+    # .to('cuda') raises on a CPU-only machine.
+    if torch.cuda.is_available():
+        for val in G_data.values():
+            val['embeddings'] = val['embeddings'].to(torch.device('cuda'))
+    # 'blender/blender/111434' must print #96153, #83604 and #79762
+    related1 = find_relatedness(
+        'blender', 111434, limit=20, state=State.all)
+    related2 = find_relatedness('blender-addons', 104399, limit=20)
+    print("These are the 20 most related issues:")
+    print(related1)
+    print()
+    print("These are the 20 most related issues:")
+    print(related2)

routers/tool_wiki_search.py ADDED Viewed

	@@ -0,0 +1,295 @@

+# routers/wiki_search.py
+import os
+import pickle
+import re
+from typing import Dict, List
+from sentence_transformers import util
+from fastapi import APIRouter
+try:
+    from .embedding import EMBEDDING_CTX
+except:
+    from embedding import EMBEDDING_CTX
+router = APIRouter()
+# NOTE(review): absolute path on a developer machine; _embeddings_generate only reads it when the pickle cache is missing.
+MANUAL_DIR = "D:/BlenderDev/blender-manual/manual/"
+# presumably the root URL of the online manual pages — TODO confirm against its users further down the file
+BASE_URL = "https://docs.blender.org/manual/en/dev"
+# Pickle file read (and, when missing, written) by _embeddings_generate.
+G_cache_path = "routers/embedding/embeddings_manual.pkl"
+# Set by _embeddings_generate(); None until then.
+G_data = None
+def _embeddings_generate():
+    """Load the manual data from the pickle cache, or build it and write the cache.
+
+    Without a cache file the manual is parsed starting at ``index.rst`` (plus
+    ``copyright.rst``), its texts are collected and embedded, and the result
+    is pickled to ``G_cache_path``. Returns the populated ``G_data``.
+    """
+    global G_data
+    if os.path.exists(G_cache_path):
+        # NOTE(review): pickle.load runs arbitrary code; the cache file must be trusted.
+        with open(G_cache_path, 'rb') as file:
+            G_data = pickle.load(file)
+            return G_data
+    # Imported here because the module header does not import torch.
+    import torch
+    G_data = parse_file_recursive(MANUAL_DIR, 'index.rst')
+    G_data['toctree']["copyright"] = parse_file_recursive(
+        MANUAL_DIR, 'copyright.rst')
+    # Create a list to store the text files
+    texts = get_texts_recursive(G_data)
+    print("Embedding Texts...")
+    G_data['texts'] = texts
+    G_data['embeddings'] = EMBEDDING_CTX.encode(texts)
+    with open(G_cache_path, "wb") as file:
+        # Converting the embeddings to be CPU compatible, as the virtual machine in use currently only supports the CPU.
+        G_data['embeddings'] = G_data['embeddings'].to(
+            torch.device('cpu'))
+        pickle.dump(G_data, file, protocol=pickle.HIGHEST_PROTOCOL)
+    return G_data
+def reduce_text(text):
+    """Drop runs of heading-adornment characters and collapse whitespace around line breaks."""
+    # (pattern, replacement) pairs, applied in this order.
+    substitutions = (
+        (r'%{2,}', ''),  # Title
+        (r'#{2,}', ''),  # Title
+        (r'\*{3,}', ''),  # Title
+        (r'={3,}', ''),  # Topic
+        (r'\^{3,}', ''),
+        (r'-{3,}', ''),
+        (r'(\s*\n\s*)+', '\n'),
+    )
+    for pattern, replacement in substitutions:
+        text = re.sub(pattern, replacement, text)
+    return text
+def parse_file_recursive(filedir, filename):
+    """Read an .rst file and, for index files, recurse into its toctree entries.
+
+    Returns a dict with ``'body'`` (the stripped text plus a trailing newline;
+    for index files only the text before the first ``.. toctree::``) and, when
+    an index file contains a toctree, ``'toctree'`` mapping each entry name to
+    the parsed dict of the file it points to.
+    """
+    with open(os.path.join(filedir, filename), 'r', encoding='utf-8') as file:
+        content = file.read()
+    parsed_data = {}
+    if filename.endswith('index.rst'):
+        head, *toctree_parts = content.split(".. toctree::")
+        body = head.strip()
+        if toctree_parts:
+            children = parsed_data["toctree"] = {}
+            for part in toctree_parts:
+                # The first line is the rest of the directive line itself.
+                for raw_entry in part.split('\n')[1:]:
+                    entry = raw_entry.strip()
+                    # Skip blanks, entries starting with '/', and non-.rst lines.
+                    if not entry or entry.startswith('/') or not entry.endswith('.rst'):
+                        continue
+                    if entry.endswith('/index.rst'):
+                        entry_name = entry[:-len('/index.rst')]
+                        child = parse_file_recursive(
+                            os.path.join(filedir, entry_name), 'index.rst')
+                    else:
+                        entry_name = entry[:-len('.rst')]
+                        child = parse_file_recursive(filedir, entry)
+                    children[entry_name] = child
+    else:
+        body = content.strip()
+    # The '\n' at the end of the file resolves regex patterns
+    parsed_data['body'] = body + '\n'
+    return parsed_data
+def split_into_topics(text: str, prefix: str = '') -> Dict[str, List[str]]:
+    """
+    Splits a text into sections based on titles and subtitles, and organizes them into a dictionary.
+    Args:
+        text (str): The input text to be split. Titles are a line framed above and below by runs of
+                    three or more ``*``, ``#`` or ``%``; subtitles are a line underlined by ``===``.
+        prefix (str): prefix to titles and subtitles
+    Returns:
+        Dict[str, List[str]]: A dictionary where keys are ``"<prefix># <title>"`` or
+                               ``"<prefix># <title> | <subtitle>"`` and values are the text chunks found
+                               under that heading. Chunks are stored as split, not stripped.
+    Raises:
+        RuntimeError: if non-blank text appears before the first title or subtitle.
+    Example:
+        text = (
+            "*********************\\n"
+            "The Blender Community\\n"
+            "*********************\\n"
+            "Being freely available from the start.\\n"
+            "Independent Sites\\n"
+            "=================\\n"
+            "There are several independent websites.\\n"
+        )
+        result = split_into_topics(text)
+        # result will be:
+        # {
+        #     "# The Blender Community": [
+        #         "\\nBeing freely available from the start.\\n"
+        #     ],
+        #     "# The Blender Community | Independent Sites": [
+        #         "There are several independent websites.\\n"
+        #     ]
+        # }
+    """
+    # Remove patterns ".. word::" and ":word:"
+    text = re.sub(r'\.\. [^\n]+\n+(?: {3,}[^\n]*\n)*|:\w+:', '', text)
+    # Regular expression to find titles and subtitles.
+    # The adornment class is written without '|': inside [...] a pipe is a
+    # literal character, not an alternation.
+    pattern = r'([*#%]{3,}\n[^\n]+\n[*#%]{3,}|(?:={3,}\n)?[^\n]+\n={3,}\n)'
+    # Split text by found patterns (the capturing group keeps the headings)
+    sections = re.split(pattern, text)
+    # Drop sections that contain only white space
+    sections = [section for section in sections if section.strip()]
+    # Separate sections into a dictionary
+    topics = {}
+    current_title = ''
+    current_topic = prefix
+    for section in sections:
+        if match := re.match(r'[*#%]{3,}\n([^\n]+)\n[*#%]{3,}', section):
+            current_topic = current_title = f'{prefix}# {match.group(1)}'
+            topics[current_topic] = []
+        elif match := re.match(r'(?:={3,}\n)?([^\n]+)\n={3,}\n', section):
+            current_topic = current_title + ' | ' + match.group(1)
+            topics[current_topic] = []
+        else:
+            if current_topic == prefix:
+                # A bare ``raise`` here has no active exception to re-raise;
+                # raise the same exception type with a usable message.
+                raise RuntimeError(
+                    "Text found before the first title; cannot assign it to a topic")
+            topics[current_topic].append(section)
+    return topics
+# Function to split a page body into titled chunks of a maximum number of tokens
+def split_into_many(page_body, prefix=''):
+    """
+    Splits a page body into "<topic>:\n<content>" strings sized for the embedding model.
+    The body is split by `split_into_topics`, each content block is passed through
+    `reduce_text`, and blocks that (together with their title) exceed
+    `EMBEDDING_CTX.model.max_seq_length` tokens are cut at sentence ends ('.\n').
+    Args:
+        page_body: Text of the page.
+        prefix: Prefix forwarded to `split_into_topics` for the topic keys.
+    Returns:
+        List of strings, one per chunk, each starting with its topic followed by ':\n'.
+    """
+    tokenizer = EMBEDDING_CTX.model.tokenizer
+    max_tokens = EMBEDDING_CTX.model.max_seq_length
+    result = []
+    for topic, content_list in split_into_topics(page_body, prefix).items():
+        title = topic + ':\n'
+        title_tokens_len = len(tokenizer.tokenize(title))
+        for content in content_list:
+            content_reduced = reduce_text(content)
+            content_tokens_len = len(tokenizer.tokenize(content_reduced))
+            if title_tokens_len + content_tokens_len <= max_tokens:
+                result.append(title + content_reduced)
+                continue
+            # Too long for a single chunk: split the text into sentences
+            pieces = content_reduced.split('.\n')
+            last = len(pieces) - 1
+            chunk = ''
+            tokens_so_far = title_tokens_len
+            for i, piece in enumerate(pieces):
+                # Restore the terminator removed by split(). The final piece never had one,
+                # so nothing is added there (and an empty final piece is skipped).
+                sentence = piece + '.\n' if i < last else piece
+                if not sentence:
+                    continue
+                # Get the number of tokens for the sentence
+                n_tokens = len(tokenizer.tokenize(sentence))
+                # If adding this sentence would exceed the max number of tokens, store the
+                # current chunk and start a new one. An empty chunk is never stored: a single
+                # sentence that is too long on its own is still emitted as its own chunk.
+                if chunk and tokens_so_far + n_tokens > max_tokens:
+                    result.append(title + chunk)
+                    chunk = ''
+                    tokens_so_far = title_tokens_len
+                chunk += sentence
+                tokens_so_far += n_tokens
+            if chunk:
+                result.append(title + chunk)
+    return result
+def get_texts_recursive(page, path=''):
+    """
+    Collects the text chunks of a parsed page and of all pages below it.
+    Args:
+        page: Parsed page dictionary with a 'body' entry and, optionally, a 'toctree'
+              entry mapping child names to child page dictionaries.
+        path: Path of `page`, used as the prefix of its chunk titles; children get
+              '<path>/<child name>'.
+    Returns:
+        List of chunk strings produced by `split_into_many`, parent first, then children.
+    """
+    result = split_into_many(page['body'], path)
+    # A page without children has no 'toctree' key. Use a default instead of catching
+    # KeyError so that errors raised while processing a child are not silently swallowed.
+    for key, page_child in page.get('toctree', {}).items():
+        result.extend(get_texts_recursive(page_child, f'{path}/{key}'))
+    return result
+def _sort_similarity(data, text_to_search, limit):
+    """
+    Returns the stored texts that best match `text_to_search`.
+    Args:
+        data: Dictionary with the corpus 'embeddings' and the matching 'texts'.
+        text_to_search: Query text; it is encoded with `EMBEDDING_CTX.encode`.
+        limit: Maximum number of texts to return (passed as `top_k`).
+    Returns:
+        List of texts in the order returned by `util.semantic_search` (dot-product score).
+    """
+    query_embedding = EMBEDDING_CTX.encode([text_to_search])
+    hits_per_query = util.semantic_search(
+        query_embedding, data['embeddings'], top_k=limit, score_function=util.dot_score)
+    corpus_texts = data['texts']
+    # A single query was encoded, so only the first list of hits is relevant
+    return [corpus_texts[hit['corpus_id']] for hit in hits_per_query[0]]
+@router.get("/wiki_search")
+def wiki_search(query: str = "") -> str:
+    """
+    Searches the embedded texts for the 5 entries most similar to `query`.
+    Args:
+        query: Text to search for.
+    Returns:
+        A string that starts with a 'BASE_URL: ...' line, followed by one block per match:
+        a '---' separator, the page path with '.html' appended, and the matched text
+        (starting at its '#' title marker).
+    """
+    data = _embeddings_generate()
+    texts = _sort_similarity(data, query, 5)
+    parts = [f'BASE_URL: {BASE_URL}\n']
+    for text in texts:
+        # The page path is whatever precedes the first '#' (the title marker)
+        path, marker, rest = text.partition('#')
+        if not marker:
+            # No title marker: keep the whole text as content instead of slicing it
+            # at a bogus position.
+            path, rest = '', text
+        parts.append(f'---\n{path}.html\n{marker}{rest}\n')
+    return ''.join(parts)
+if __name__ == '__main__':
+    # Manual smoke test: run the search for the first sample query and print the result
+    sample_queries = ["Set Snap Base", "Building the Manual", "Bisect Object"]
+    print(wiki_search(sample_queries[0]))

utils/generate_blender_doc.py DELETED Viewed

@@ -1,194 +0,0 @@
-import os
-import sys
-import re
-from sentence_transformers import util
-script_dir = os.path.dirname(os.path.realpath(__file__))
-parent_dir = os.path.dirname(script_dir)
-sys.path.append(parent_dir)
-# autopep8: off
-from routers.tool_find_related import EMBEDDING_CTX
-# autopep8: on
-MANUAL_DIR = "D:/BlenderDev/blender-manual/manual/"
-BASE_URL = "https://docs.blender.org/manual/en/dev"
-def process_text(text):
-    # Remove repeated characters
-    text = re.sub(r'%{2,}', '', text)
-    text = re.sub(r'#{2,}', '', text)
-    text = re.sub(r'={3,}', '', text)
-    text = re.sub(r'\*{3,}', '', text)
-    text = re.sub(r'\^{3,}', '', text)
-    text = re.sub(r'-{3,}', '', text)
-    # Remove patterns ".. word:: " and ":word:"
-    text = re.sub(r'\.\. \S+', '', text)
-    text = re.sub(r':\w+:', '', text)
-    text = re.sub(r'(\s*\n\s*)+', '\n', text)
-    return text
-def parse_file(filedir, filename):
-    with open(os.path.join(filedir, filename), 'r', encoding='utf-8') as file:
-        content = file.read()
-    parsed_data = {}
-    if not filename.endswith('index.rst'):
-        body = content.strip()
-    else:
-        parts = content.split(".. toctree::")
-        body = parts[0].strip()
-        if len(parts) > 1:
-            parsed_data["toctree"] = {}
-            for part in parts[1:]:
-                toctree_entries = part.split('\n')
-                line = toctree_entries[0]
-                for entry in toctree_entries[1:]:
-                    entry = entry.strip()
-                    if not entry:
-                        continue
-                    if entry.startswith('/'):
-                        # relative path.
-                        continue
-                    if not entry.endswith('.rst'):
-                        continue
-                    if entry.endswith('/index.rst'):
-                        entry_name = entry[:-10]
-                        filedir_ = os.path.join(filedir, entry_name)
-                        filename_ = 'index.rst'
-                    else:
-                        entry_name = entry[:-4]
-                        filedir_ = filedir
-                        filename_ = entry
-                    parsed_data['toctree'][entry_name] = parse_file(
-                        filedir_, filename_)
-    processed_text = process_text(body)
-    tokens = EMBEDDING_CTX.model.tokenizer.tokenize(processed_text)
-    if len(tokens) > EMBEDDING_CTX.model.max_seq_length:
-        pass
-    # parsed_data['body'] = body
-    parsed_data['processed_text'] = processed_text
-    parsed_data['n_tokens'] = len(tokens)
-    return parsed_data
-# Function to split the text into chunks of a maximum number of tokens
-def split_into_many(text, max_tokens):
-    # Split the text into sentences
-    paragraphs = text.split('.\n')
-    # Get the number of tokens for each sentence
-    n_tokens = [len(EMBEDDING_CTX.model.tokenizer.tokenize(" " + sentence))
-                for sentence in paragraphs]
-    chunks = []
-    tokens_so_far = 0
-    chunk = []
-    # Loop through the sentences and tokens joined together in a tuple
-    for sentence, token in zip(paragraphs, n_tokens):
-        # If the number of tokens so far plus the number of tokens in the current sentence is greater
-        # than the max number of tokens, then add the chunk to the list of chunks and reset
-        # the chunk and tokens so far
-        if tokens_so_far + token > max_tokens:
-            chunks.append((".\n".join(chunk) + ".", tokens_so_far))
-            chunk = []
-            tokens_so_far = 0
-        # If the number of tokens in the current sentence is greater than the max number of
-        # tokens, go to the next sentence
-        if token > max_tokens:
-            continue
-        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
-        chunk.append(sentence)
-        tokens_so_far += token + 1
-    if chunk:
-        chunks.append((".\n".join(chunk) + ".", tokens_so_far))
-    return chunks
-def get_texts(data, path):
-    result = []
-    processed_texts = [data['processed_text']]
-    processed_tokens = [data['n_tokens']]
-    max_tokens = EMBEDDING_CTX.model.max_seq_length
-    data_ = data
-    for key in path:
-        data_ = data_['toctree'][key]
-        processed_texts.append(data_['processed_text'])
-        processed_tokens.append(data_['n_tokens'])
-    if processed_tokens[-1] > max_tokens:
-        chunks = split_into_many(processed_texts[-1], max_tokens)
-    else:
-        chunks = [(processed_texts[-1], processed_tokens[-1])]
-    for text, n_tokens in chunks:
-        # Add context to the text if we have space
-        for i in range(len(processed_texts) - 2, -1, -1):
-            n_tokens_parent = processed_tokens[i]
-            if n_tokens + n_tokens_parent >= max_tokens:
-                break
-            text_parent = processed_texts[i]
-            text = text_parent + '\n' + text
-            n_tokens += n_tokens_parent
-        result.append([path, text])
-    try:
-        for key in data_['toctree'].keys():
-            result.extend(get_texts(data, path + [key]))
-    except KeyError:
-        pass
-    return result
-def _sort_similarity(chunks, embeddings, text_to_search, limit):
-    results = []
-    query_emb = EMBEDDING_CTX.encode([text_to_search])
-    ret = util.semantic_search(
-        query_emb, embeddings, top_k=limit, score_function=util.dot_score)
-    for score in ret[0]:
-        corpus_id = score['corpus_id']
-        chunk = chunks[corpus_id]
-        path = chunk[0]
-        results.append(path)
-    return results
-if __name__ == '__main__':
-    # path = 'addons/3d_view'
-    data = parse_file(MANUAL_DIR, 'index.rst')
-    data['toctree']["copyright"] = parse_file(MANUAL_DIR, 'copyright.rst')
-    # Create a list to store the text files
-    chunks = []
-    chunks.extend(get_texts(data, []))
-    embeddings = EMBEDDING_CTX.encode([text for path, text in chunks])
-    result = _sort_similarity(chunks, embeddings, "Set Snap Base", 50)
-    print(result)