# Ingestion utilities for the project's on-disk Chroma vector store:
# chunk markdown / PDF sources, embed the chunks, and add them to a
# named Chroma collection.
#
# Adapted in part from:
# https://diptimanrc.medium.com/rapid-q-a-on-multiple-pdfs-using-langchain-and-chromadb-as-local-disk-vector-store-60678328c0df
# https://stackoverflow.com/questions/76482987/chroma-database-embeddings-none-when-using-get
# https://docs.trychroma.com/embeddings/hugging-face?lang=py
# https://www.datacamp.com/tutorial/chromadb-tutorial-step-by-step-guide
# https://python.langchain.com/docs/modules/data_connection/retrievers/self_query
# https://python.langchain.com/docs/integrations/vectorstores/chroma#update-and-delete
# https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore

import chromadb

from langchain.text_splitter import CharacterTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.embeddings import GPT4AllEmbeddings

from app.utils.utils import generate_uuid
from app.core.config import settings

# Location of the persistent Chroma database, taken from central settings
# (previously read from the VECTOR_DATABASE_LOCATION env var directly).
persist_directory = settings.VECTOR_DATABASE_LOCATION


def read_markdown_file(file_path: str) -> str:
    """
    Read a Markdown file and return its content as a single string.

    Args:
        file_path (str): The path to the Markdown file.

    Returns:
        str: The content of the Markdown file as a single string.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()


def _embed_and_add_documents(collection, docs: list[Document]) -> None:
    """
    Embed a list of Documents and add them to a Chroma collection.

    Shared by the markdown and PDF ingestion paths so the embed/add logic
    lives in exactly one place.

    Args:
        collection: A chromadb collection (as returned by
            ``client.get_or_create_collection``).
        docs (list[Document]): Pre-chunked documents to store.
    """
    embedding_function = GPT4AllEmbeddings()

    texts = [doc.page_content for doc in docs]
    # LangChain Embeddings objects are not callable; the supported API is
    # embed_documents(). Batching all chunks into one call also avoids
    # re-entering the model once per chunk.
    vectors = embedding_function.embed_documents(texts)

    for doc, text, vector in zip(docs, texts, vectors):
        collection.add(
            ids=[generate_uuid()],      # give each chunk a fresh uuid
            documents=[text],           # chunk contents
            embeddings=[vector],        # pre-computed embedding
            metadatas=[doc.metadata],   # parallel lists, per the Chroma API
        )


def add_markdown_to_collection(
    markdown_file_location: str,
    collection_name: str,
    chunk_size: int,
    chunk_overlap: int,
) -> None:
    """
    Embed markdown data into a given Chroma collection.

    Args:
        markdown_file_location (str): Location of the markdown file.
        collection_name (str): The collection where the documents are added.
        chunk_size (int): Size of the chunks to be embedded.
        chunk_overlap (int): The amount of overlap between chunks.
    """
    markdown_document = read_markdown_file(markdown_file_location)

    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
        ("###", "Header 3"),
    ]

    # First split on markdown headers (keeps header info in metadata), then
    # re-split the sections into size-bounded chunks.
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on
    )
    md_header_splits = markdown_splitter.split_text(markdown_document)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    splits = text_splitter.split_documents(md_header_splits)

    client = chromadb.PersistentClient(
        path=persist_directory,
    )
    # If the collection already exists we just get it back, which lets us
    # keep adding data to an existing collection.
    collection = client.get_or_create_collection(
        name=collection_name,
    )

    _embed_and_add_documents(collection, splits)


def split_by_intervals(s: str, interval: int, overlapped: int = 0) -> list:
    """
    Split a string into intervals of a given length, with optional overlapping.

    Args:
        s: The input string.
        interval: The length of each interval.
        overlapped: The number of characters to overlap between intervals.
            Must be smaller than ``interval``. Default is 0.

    Returns:
        A list of substrings, each containing up to ``interval`` characters
        from the input string.

    Raises:
        ValueError: If ``overlapped >= interval`` (the window would never
            advance) or ``interval <= 0``.
    """
    step = interval - overlapped
    if interval <= 0 or step <= 0:
        raise ValueError(
            "interval must be positive and overlapped must be smaller than interval"
        )
    return [s[i:i + interval] for i in range(0, len(s), step)]


def add_pdf_to_vector_store(
    collection_name,
    pdf_file_location: str,
    text_chunk_size=1000,
    text_chunk_overlap=10,
) -> None:
    """
    ## Summary
    Given the location of a PDF file, chunk its contents and store them in
    the given Chroma collection.

    ## Arguments
        collection_name (str): Name of the collection to store documents.
        pdf_file_location (str): Location of the PDF file.
        text_chunk_size (int): Size of each text chunk in characters.
        text_chunk_overlap (int): Characters of overlap between chunks.

    ## Return
        None
    """
    loader = PyPDFLoader(pdf_file_location)
    documents = list(loader.load())

    # Re-chunk each PDF page into fixed-size overlapping windows, carrying
    # the page's metadata onto every sub-chunk.
    split_docs: list[Document] = []
    for document in documents:
        for sub_doc in split_by_intervals(
            document.page_content,
            text_chunk_size,
            text_chunk_overlap,
        ):
            split_docs.append(Document(sub_doc, metadata=document.metadata))

    client = chromadb.PersistentClient(
        path=persist_directory,
    )
    collection = client.get_or_create_collection(
        name=collection_name,
    )

    _embed_and_add_documents(collection, split_docs)


if __name__ == "__main__":
    collection_name = "ArxivPapers"

    client = chromadb.PersistentClient(
        path=persist_directory,
    )
    # To start from scratch instead, delete the existing collection first:
    # client.delete_collection(name=collection_name)
    collection = client.get_or_create_collection(
        name=collection_name,
    )

    # Ingest two example papers into the same collection.
    for pdf_file_location in (
        "/workspaces/InnovationPathfinderAI/2212.02623.pdf",
        "/workspaces/InnovationPathfinderAI/2402.17764.pdf",
    ):
        add_pdf_to_vector_store(
            collection_name=collection_name,
            pdf_file_location=pdf_file_location,
        )

    # Integrate Chroma with LangChain: wrap the persistent client in a
    # LangChain vectorstore and use it as a retriever.
    embedding_function = GPT4AllEmbeddings()
    vector_db = Chroma(
        client=client,
        collection_name=collection_name,
        embedding_function=embedding_function,
    )

    query = "ai"  # example query
    retriever = vector_db.as_retriever()
    docs = retriever.get_relevant_documents(query)  # returns a list of Documents