import chromadb
from chromadb.utils import embedding_functions
from .models import Param
import os
import torch
from openai import OpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from itertools import islice, zip_longest
import re

# --- Chat model ------------------------------------------------------------
# Previously tried: "mistralai/Mistral-7B-Instruct-v0.2"
model_id = "Yugo60-GPT-GGUF.Q4_K_M.gguf"

# Disabled generation smoke test kept for reference:
#   outputs = model.generate(**inputs, max_new_tokens=20)
#   print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# --- Torch device ----------------------------------------------------------
# Make MPS the default torch device when it is available; otherwise print
# which of the two preconditions is missing.
if torch.backends.mps.is_available():
    torch.set_default_device("mps")
elif not torch.backends.mps.is_built():
    print("MPS not available because the current PyTorch install was not "
          "built with MPS enabled.")
else:
    print("MPS not available because the current MacOS version is not 12.3+ "
          "and/or you do not have an MPS-enabled device on this machine.")

model = ""

# --- Vector store / embedding settings --------------------------------------
CHROMA_DATA_PATH = "/Users/zoranpopovic/uchat/chroma_data/"
# Other embedding models noted by the author:
#   NousResearch/Hermes-2-Pro-Mistral-7B
#   distilbert-base-multilingual-case
#   paraphrase-multilingual-MiniLM-L12-v2d
EMBED_MODEL = "all-MiniLM-L6-v2"
COLLECTION_NAME = "chroma_data"

# --- Source documents and chunking ------------------------------------------
PDF_PATH = "./PDF/uputstvo_uz_eon_smart_box-1.pdf"
PDF_PATH2 = "./PDF/uputstvo_uz_eon_smart_aplikaciju-1.pdf"
CHUNK_SIZE = 800
CHUNK_OVERLAP = 50

# --- Retrieval thresholds ---------------------------------------------------
max_results = 3
min_len = 40
min_distance = 0.35
max_distance = 0.6

# --- Generation parameters --------------------------------------------------
temperature = 0.55
max_tokens = 3072
top_p = 0.8
frequency_penalty = 0.0
presence_penalty = 0.15

DEBUG = True

# --- Default Serbian system prompt ------------------------------------------
# A further sentence was left disabled by the author between the two parts
# below: " Ako ne znaš odgovor, reci da ne znaš, ne izmišljaj ga."
# (English: "If you don't know the answer, say you don't know; don't make
# one up.")
system_sr = (
    "Zoveš se U-Chat AI asistent i pomažeš korisniku usluga kompanije United Group. Korisnik postavlja pitanje ili problem, upareno sa dodatnima saznanjima. Na osnovu toga napiši korisniku kratak i ljubazan odgovor koji kompletira njegov zahtev ili mu daje odgovor na pitanje. "
    "Usluge kompanije United Group uključuju i kablovsku mrežu za digitalnu televiziju, pristup internetu, uređaj EON SMART BOX za TV sadržaj, kao i fiksnu telefoniju."
)
# Per-language system prompts and retrieved-context prefixes. Only Serbian
# has a built-in default; the other languages start empty.
system = {'srpski': system_sr, 'hrvatski': "", 'slovenački': "", 'makedonski': ""}
ctxpre = ""
msg_content = {'srpski': "- Dodatna saznanja su: ", 'hrvatski': "", 'slovenački': "", 'makedonski': ""}
max_conv = 3

# Override the defaults above with values stored in Param rows. The prompt
# tables are keyed by each row's language; every other setting is a plain
# module global, so the last row iterated wins.
try:
    edit_all = Param.objects.all()
    for edit in edit_all:
        system[edit.jezik] = edit.system
        ctxpre = edit.ctxpre
        msg_content[edit.jezik] = edit.msg_content
        min_len = edit.min_len
        CHUNK_SIZE = edit.CHUNK_SIZE
        CHUNK_OVERLAP = edit.CHUNK_OVERLAP
        max_results = edit.max_results
        EMBED_MODEL = edit.EMBED_MODEL
        model_id = edit.model_id
        min_distance = edit.min_distance
        max_distance = edit.max_distance
        max_conv = edit.max_conv
        temperature = edit.temperature
        top_p = edit.top_p
        max_tokens = edit.max_tokens
        presence_penalty = edit.presence_penalty
        frequency_penalty = edit.frequency_penalty
        DEBUG = edit.DEBUG
except Exception as exc:
    # Still best-effort (the module keeps its defaults), but no longer a bare
    # ``except`` that also swallows KeyboardInterrupt/SystemExit and hides
    # the reason. NOTE(review): presumably this guards against the Param
    # table being unavailable at import time -- confirm.
    if DEBUG:
        print(f"Param settings not loaded, using built-in defaults: {exc!r}")


def load_and_split_document(pdf_path):
    """Load the PDF at *pdf_path* and return ``PyPDFLoader.load_and_split()``.

    Prints a ``Loaded: <path>`` line before the file is actually read.
    """
    loader = PyPDFLoader(pdf_path)
    print('Loaded: ' + pdf_path)
    return loader.load_and_split()


def split_text_into_chunks(pages, chunk_size, chunk_overlap):
    """Clean page texts, drop empty pages and split the rest into chunks.

    Args:
        pages: Document objects exposing a writable ``page_content`` string.
            Their text is normalised in place.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of ``split_documents`` on the non-empty pages.
    """
    for page in pages:
        # NOTE(review): r'\n' matches a literal backslash followed by "n",
        # not a newline character -- confirm that this is the intent.
        text = page.page_content.replace(". .", "").replace(r'\n', '.')
        page.page_content = re.sub(r'\s+', " ", text).replace('..', '')

    # The original compaction loop did not parse (``len(...)=0``) and its
    # index arithmetic could not filter correctly. Whitespace-only pages are
    # treated as empty because the normalisation above turns any whitespace
    # run into a single space.
    pages = [page for page in pages if page.page_content.strip()]

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(pages)


def batched(iterable, n):
    """Yield successive lists of at most *n* items taken from *iterable*."""
    it = iter(iterable)
    while True:
        batch = list(islice(it, n))
        if not batch:
            return
        yield batch


# To rebuild the collection from scratch (disabled by the author):
#   client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)
#   client.allow_reset = True
#   client.delete_collection(COLLECTION_NAME)

oc = OpenAI(base_url="http://localhost:4891/v1", api_key="not-needed")

chroma_client = chromadb.PersistentClient(CHROMA_DATA_PATH)
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL
)
collection = chroma_client.get_or_create_collection(
    # Same value as the literal "chroma_data" used before; now kept in sync
    # with the constant that the reset snippet above also refers to.
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},
)
# Number of stored documents; used as the next free numeric id suffix.
last = collection.count()


def update_collection(docs, last, jezik):
    """Add dialogue-state documents to the collection.

    Args:
        docs: Flat sequence alternating ``state, groups, state, groups, ...``.
            ``groups[0]`` holds the documents for the state itself (stored
            with ``next`` equal to the state). Every later group has the
            form ``[next_state, text, text, ...]``.
            NOTE(review): element types are inferred from the indexing done
            here -- confirm against the caller.
        last: First free numeric id suffix.
        jezik: Language tag stored as ``lang`` in every metadata record.

    Returns:
        The next free numeric id suffix after all insertions.

    Raises:
        IndexError: If a state has no following ``groups`` entry, or its
            ``groups`` is empty (same as before).
    """
    # NOTE(review): the original called ``bot.uchat.collection``, but no name
    # ``bot`` is bound in this module, so the call raised NameError. This
    # module's own ``collection`` is assumed to be that object -- confirm
    # that this file is ``bot/uchat.py``.

    def _metadatas(state, next_state, count):
        # One identical metadata record per inserted document.
        return [
            {"state": state, "next": next_state, "used": False,
             "source": 'None', "page": -1, "lang": jezik}
            for _ in range(count)
        ]

    for idx in range(0, len(docs), 2):
        state = docs[idx]
        groups = docs[idx + 1]

        documents = groups[0]
        if documents:  # skip the call instead of sending an empty batch
            collection.add(
                documents=documents,
                ids=[f"id{last + i}" for i in range(len(documents))],
                metadatas=_metadatas(state, state, len(documents)),
            )
        last += len(documents)

        for group in groups[1:]:
            next_state, texts = group[0], group[1:]
            if texts:
                collection.add(
                    documents=texts,
                    ids=[f"id{last + i}" for i in range(len(texts))],
                    metadatas=_metadatas(state, next_state, len(texts)),
                )
            last += len(texts)
    return last


#docus = load_and_split_document(PDF_PATH) + load_and_split_document(PDF_PATH2)
def load_docs(path, jezik, batch_size=66):
    """Load a PDF, chunk it and store the chunks in the collection.

    Args:
        path: PDF file path passed to ``load_and_split_document``.
        jezik: Language tag stored as ``lang`` in every chunk's metadata.
        batch_size: Number of chunks sent per ``collection.add`` call.

    Returns:
        The next free numeric id suffix, i.e. the module-level ``last`` at
        call time plus the number of chunks stored.
    """
    docus = load_and_split_document(path)
    pages = split_text_into_chunks(docus, CHUNK_SIZE, CHUNK_OVERLAP)

    # The original read ``last`` before assigning it (UnboundLocalError),
    # indexed ``pages`` with ids already offset by ``bot.uchat.last`` and
    # then added the offset again. Chunk positions and id suffixes are kept
    # separate here. NOTE(review): module-level ``last`` replaces the unbound
    # ``bot.uchat.last`` -- confirm that this file is ``bot/uchat.py``.
    first_id = last
    for batch in batched(range(len(pages)), batch_size):
        collection.add(
            ids=[f"id{first_id + i}" for i in batch],
            documents=[pages[i].page_content for i in batch],
            metadatas=[
                {**pages[i].metadata, "used": False, "next": 'None',
                 "state": 'None', "lang": jezik}
                for i in batch
            ],
        )
    return first_id + len(pages)