"""Retrieve documentation for a given query."""

from pathlib import Path
from typing import Any

from langchain.text_splitter import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)
from manifest import Manifest
import numpy as np
from rich.console import Console
from tqdm import tqdm

console = Console(soft_wrap=True)

try:
    EMBEDDING_MODEL = Manifest(
        client_name="openaiembedding",
    )
except Exception as e:
    console.print(e)
    console.print(
        "Failed to load the embedding model. The OpenAI API key is likely not set; "
        "set it to run document retrieval.",
        style="bold red",
    )


def load_documentation(path: Path) -> dict[str, str]:
    """Load all markdown files under `path`, keyed by their path relative to it."""
    content = {}
    for file in path.glob("**/*.md"):
        with open(file, "r") as f:
            data = f.read()
        key = str(file).replace(str(path), "")
        content[key] = data
    return content


def split_documents(content: dict[str, str]) -> list[dict[str, Any]]:
    """Split documents into chunks: first on markdown headers, then on length."""
    md_split_docs = []
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
    )
    # Chunks of at most 500 characters with a 50-character overlap.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n"], chunk_size=500, chunk_overlap=50, length_function=len
    )

    for file, raw_doc in content.items():
        split_text = markdown_splitter.split_text(raw_doc)
        # Tag each chunk with its source file so results can be traced back.
        for t in split_text:
            t.metadata["source"] = file
        md_split_docs.extend(split_text)

    docs = text_splitter.split_documents(md_split_docs)
    docs_as_dicts = [doc.dict() for doc in docs]
    return docs_as_dicts


def get_embeddings(text: str) -> np.ndarray:
    """Embed `text` with the Manifest embedding client."""
    return np.array(EMBEDDING_MODEL.run(text))


def embed_documents(
    chunked_docs: list[dict[str, Any]], key: str = "page_content"
) -> tuple[list[dict[str, Any]], np.ndarray]:
    """Embed each chunk and stack all embeddings into a single matrix."""
    all_embeddings = []
    for doc in tqdm(chunked_docs):
        emb = get_embeddings(doc[key])
        doc["embedding"] = emb
        all_embeddings.append(emb)
    # One row per chunk, in the same order as `chunked_docs`.
    full_embedding_mat = np.vstack(all_embeddings)
    return chunked_docs, full_embedding_mat


def query_docs(
    query: str,
    docs: list[dict[str, Any]],
    embedding_mat: np.ndarray,
    top_n: int = 10,
    key: str = "page_content",
) -> tuple[list[int], list[str]]:
    """Return the indices and contents of the `top_n` chunks most similar to `query`."""
    query_embedding = get_embeddings(query)
    # OpenAI embeddings are unit-normalized, so the dot product is cosine similarity.
    scores = embedding_mat.dot(query_embedding)
    sorted_indices = np.argsort(scores)[::-1]
    top_n_indices = sorted_indices[:top_n]
    # Reverse so the highest-scoring chunk is returned last.
    top_n_indices_rev = top_n_indices[::-1]
    returned_docs = [docs[i][key] for i in top_n_indices_rev]
    return top_n_indices_rev.tolist(), returned_docs
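

# Minimal usage sketch of the full pipeline. The `docs/` directory and the query
# string below are illustrative assumptions, not part of the module.
if __name__ == "__main__":
    documentation = load_documentation(Path("docs"))
    chunks = split_documents(documentation)
    chunks, embedding_mat = embed_documents(chunks)
    indices, top_chunks = query_docs(
        "How do I install the package?", chunks, embedding_mat, top_n=3
    )
    for i, chunk in zip(indices, top_chunks):
        console.print(f"[{i}] {chunk}")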