from fastapi import FastAPI

from txtai.embeddings import Embeddings
from txtai.pipeline import Extractor

from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import pandas as pd
import sqlite3
import os

app = FastAPI(docs_url="/")


def load_embeddings(
    domain: str = "",
    db_present: bool = True,
    path: str = "sentence-transformers/all-MiniLM-L6-v2",
    index_name: str = "index",
):
    """Build a txtai Embeddings instance, loading an existing index when one is on disk."""
    embeddings = Embeddings({"path": path, "content": True})

    # No document database yet: return a fresh, empty embeddings instance.
    if not db_present:
        return embeddings

    # Otherwise load the saved index, either the top-level one or a per-domain one.
    if domain == "":
        embeddings.load(index_name)
    else:
        embeddings.load(f"{index_name}/{domain}")
    return embeddings


def _check_if_db_exists(db_path: str) -> bool:
    """Return True if a txtai document database already exists at db_path."""
    return os.path.exists(db_path)


def _text_splitter(doc):
    """Split documents into 500-character chunks with 50 characters of overlap."""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
    )
    return text_splitter.transform_documents(doc)


def _load_docs(path: str):
    """Fetch a web page with WebBaseLoader and split it into chunks."""
    docs = WebBaseLoader(path).load()
    return _text_splitter(docs)


def _stream(dataset, limit, index: int = 0):
    """Yield up to `limit` (id, text, tags) tuples for txtai, assigning ids from `index`."""
    count = 0
    for row in dataset:
        yield (index, row.page_content, None)
        index += 1
        count += 1

        if count >= limit:
            break
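
# For reference, txtai's Embeddings.index()/upsert() consume an iterable of
# (id, text, tags) tuples, which is the shape _stream() produces. A minimal
# standalone sketch (illustrative values only; "index/demo" is a made-up path):
#
#   embeddings = Embeddings({"path": "sentence-transformers/all-MiniLM-L6-v2", "content": True})
#   embeddings.index([(0, "first chunk", None), (1, "second chunk", None)])
#   embeddings.save("index/demo")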


def _max_index_id(path):
    """Return the highest indexid in the txtai document database at `path`."""
    db = sqlite3.connect(path)
    try:
        # txtai's SQLite content store keeps document rows in a "sections" table.
        df = pd.read_sql_query("select max(indexid) as max_index from sections", db)
        return {"max_index": int(df["max_index"].iloc[0])}
    finally:
        db.close()


def _upsert_docs(doc, embeddings, vector_doc_path: str, db_present: bool):
    """Index (or upsert) chunked documents and save the index to vector_doc_path."""
    if db_present:
        # Existing index: continue ids after the current maximum so that
        # upsert() does not overwrite rows that are already stored.
        max_index = _max_index_id(f"{vector_doc_path}/documents")
        embeddings.upsert(_stream(doc, 500, max_index["max_index"] + 1))
    else:
        # No index yet: build one from scratch, with ids starting at 0.
        embeddings.index(_stream(doc, 500, 0))

    embeddings.save(vector_doc_path)
    return _max_index_id(f"{vector_doc_path}/documents")


@app.get("/index/{domain}/")
def get_domain_file_path(domain: str, file_path: str):
    """Index the web page at `file_path` into the txtai index for `domain`.

    Example: GET /index/mydomain/?file_path=https://example.com/article
    """
    db_path = os.path.join(os.getcwd(), "index", domain, "documents")
    db_exists = _check_if_db_exists(db_path=db_path)

    # The flow is identical whether or not an index already exists; the
    # db_present flag tells the helpers to load-and-upsert versus build fresh.
    embeddings = load_embeddings(domain=domain, db_present=db_exists)
    doc = _load_docs(file_path)
    _upsert_docs(
        doc=doc,
        embeddings=embeddings,
        vector_doc_path=f"index/{domain}",
        db_present=db_exists,
    )

    return "Executed Successfully!!"
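

# Local entry point (a minimal sketch: uvicorn is assumed to be installed, and
# the host/port values are illustrative defaults, not from the original).
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)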