import chromadb
from chromadb.utils import embedding_functions
from .models import Param
import os
import torch
from openai import OpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from itertools import islice, zip_longest
import re
# model_id = "mistralai/Mistral-7B-Instruct-v0.2"
model_id = "Yugo60-GPT-GGUF.Q4_K_M.gguf"
#outputs = model.generate(**inputs, max_new_tokens=20)
#print(tokenizer.decode(outputs[0], skip_special_tokens=True))
# Prefer Apple's Metal Performance Shaders (MPS) backend when available.
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current macOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
else:
    torch.set_default_device("mps")
model = ""  # placeholder; completions go through the local OpenAI-compatible endpoint below
CHROMA_DATA_PATH = "/Users/zoranpopovic/uchat/chroma_data/"
EMBED_MODEL = "all-MiniLM-L6-v2"
# NousResearch/Hermes-2-Pro-Mistral-7B
# distilbert-base-multilingual-cased
# paraphrase-multilingual-MiniLM-L12-v2
COLLECTION_NAME = "chroma_data"
PDF_PATH = "./PDF/uputstvo_uz_eon_smart_box-1.pdf"
PDF_PATH2 = "./PDF/uputstvo_uz_eon_smart_aplikaciju-1.pdf"
CHUNK_SIZE = 800
CHUNK_OVERLAP = 50
max_results = 3
min_len = 40
min_distance = 0.35
max_distance = 0.6
temperature = 0.55
max_tokens = 3072
top_p = 0.8
frequency_penalty = 0.0
presence_penalty = 0.15
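# The knobs above are forwarded to chat completion calls: temperature and top_p
# control sampling randomness, the frequency/presence penalties discourage
# repetition, and max_tokens caps the reply length.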
DEBUG = True
# System prompt (Serbian): "Your name is U-Chat AI assistant and you help customers of
# United Group. The user asks a question or states a problem, paired with additional
# context. Based on that, write the user a short, polite reply that completes the
# request or answers the question."
system_sr = "Zoveš se U-Chat AI asistent i pomažeš korisniku usluga kompanije United Group. Korisnik postavlja pitanje ili problem, upareno sa dodatnim saznanjima. Na osnovu toga napiši korisniku kratak i ljubazan odgovor koji kompletira njegov zahtev ili mu daje odgovor na pitanje. "
# " Ako ne znaš odgovor, reci da ne znaš, ne izmišljaj ga."  ("If you don't know the answer, say so; don't make one up.")
# Appended (Serbian): "United Group's services also include a cable network for digital
# television, internet access, the EON SMART BOX device for TV content, and fixed telephony."
system_sr += "Usluge kompanije United Group uključuju i kablovsku mrežu za digitalnu televiziju, pristup internetu, uređaj EON SMART BOX za TV sadržaj, kao i fiksnu telefoniju."
system = {'srpski': system_sr, 'hrvatski': "", 'slovenački': "", 'makedonski': ""}  # keyed by language name
ctxpre = ""
msg_content = {'srpski': "- Dodatna saznanja su: ", 'hrvatski': "", 'slovenački': "", 'makedonski': ""}  # "- Additional context is: "
max_conv = 3
# Optionally override the defaults above with values from the Param table
# (language-specific fields are keyed by `jezik` = language; scalar settings take
# the last row's values). Defaults are kept if the table is missing or empty.
try:
    edit_all = Param.objects.all()
    for edit in edit_all:
        system[edit.jezik] = edit.system
        ctxpre = edit.ctxpre
        msg_content[edit.jezik] = edit.msg_content
        min_len = edit.min_len
        CHUNK_SIZE = edit.CHUNK_SIZE
        CHUNK_OVERLAP = edit.CHUNK_OVERLAP
        max_results = edit.max_results
        EMBED_MODEL = edit.EMBED_MODEL
        model_id = edit.model_id
        min_distance = edit.min_distance
        max_distance = edit.max_distance
        max_conv = edit.max_conv
        temperature = edit.temperature
        top_p = edit.top_p
        max_tokens = edit.max_tokens
        presence_penalty = edit.presence_penalty
        frequency_penalty = edit.frequency_penalty
        DEBUG = edit.DEBUG
except Exception:
    pass
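# Hypothetical override row, mirroring the attributes read above (illustrative
# values only; the real Param schema lives in .models):
# Param.objects.create(jezik='srpski', system=system_sr, ctxpre='',
#                      msg_content='- Dodatna saznanja su: ', min_len=40,
#                      CHUNK_SIZE=800, CHUNK_OVERLAP=50, max_results=3,
#                      EMBED_MODEL=EMBED_MODEL, model_id=model_id,
#                      min_distance=0.35, max_distance=0.6, max_conv=3,
#                      temperature=0.55, top_p=0.8, max_tokens=3072,
#                      presence_penalty=0.15, frequency_penalty=0.0, DEBUG=True)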
def load_and_split_document(pdf_path):
    """Load a PDF and split it into per-page Documents."""
    loader = PyPDFLoader(pdf_path)
    print('Loaded: ' + pdf_path)
    return loader.load_and_split()
def split_text_into_chunks(pages, chunk_size, chunk_overlap):
    """Clean up extraction artifacts, drop near-empty pages, and chunk the rest."""
    # Collapse whitespace and strip PDF-extraction leftovers from each page.
    for page in pages:
        page.page_content = re.sub(r'\s+', " ", page.page_content.replace(". .", "").replace(r'\n', '.')).replace('..', '')
    # Keep only pages whose cleaned text reaches min_len characters.
    pages = [p for p in pages if len(p.page_content) >= min_len]
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(pages)
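# Illustrative, commented-out use of the two helpers above (PDF_PATH is the
# manual configured earlier):
# pages = load_and_split_document(PDF_PATH)
# chunks = split_text_into_chunks(pages, CHUNK_SIZE, CHUNK_OVERLAP)
# print(len(chunks), chunks[0].page_content[:80])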
def batched(iterable, n):
    """Yield successive lists of up to n items from iterable."""
    it = iter(iterable)
    while True:
        batch = list(islice(it, n))
        if not batch:
            return
        yield batch
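# Equivalent to itertools.batched (Python 3.12+) except that it yields lists,
# e.g. list(batched("ABCDE", 2)) -> [['A', 'B'], ['C', 'D'], ['E']].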
#client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)
#client.allow_reset = True
#client.delete_collection(COLLECTION_NAME)
oc = OpenAI(base_url="http://localhost:4891/v1", api_key="not-needed")
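# The local server is expected to expose an OpenAI-compatible chat API (4891 is
# GPT4All's default port). A minimal sketch of a call against it, reusing the
# sampling parameters above; the user message is illustrative, not from this file:
# response = oc.chat.completions.create(
#     model=model_id,
#     messages=[{"role": "system", "content": system['srpski']},
#               {"role": "user", "content": "Kako da resetujem EON SMART BOX?"}],
#     temperature=temperature, top_p=top_p, max_tokens=max_tokens,
#     frequency_penalty=frequency_penalty, presence_penalty=presence_penalty,
# )
# answer = response.choices[0].message.content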
chroma_client = chromadb.PersistentClient(CHROMA_DATA_PATH)
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL
)
collection = chroma_client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},
)
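# With "hnsw:space": "cosine", query results carry cosine distances, which the
# min_distance/max_distance thresholds above are presumably meant to filter.
# A minimal, commented-out query sketch (the query text is illustrative):
# results = collection.query(query_texts=["Kako da povežem EON SMART BOX?"], n_results=max_results)
# for doc, dist in zip(results["documents"][0], results["distances"][0]):
#     print(round(dist, 3), doc[:80])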
last = collection.count()  # next free numeric suffix for "id{n}" document ids
def update_collection(docs, last, jezik):
    """Add dialogue-state documents to the collection and return the next free id index.

    `docs` alternates a state key with its payload list: the payload's first
    element is the list of documents for that state, and each further element is
    a [next_state, *documents] group.
    """
    state = -2
    used = []  # accumulated below but not consumed here
    for g in docs[0::2]:
        state += 2
        documents = docs[state + 1][0]
        collection.add(
            documents=documents,
            ids=[f"id{last + i}" for i in range(len(documents))],
            metadatas=[{"state": g, "next": g, "used": False, "source": 'None', "page": -1, "lang": jezik} for i in range(len(documents))]
        )
        last += len(documents)
        if len(docs[state + 1]) > 1:
            for n in docs[state + 1][1:]:
                collection.add(
                    documents=n[1:],
                    ids=[f"id{last + i - 1}" for i in range(1, len(n))],
                    metadatas=[{"state": g, "next": n[0], "used": False, "source": 'None', "page": -1, "lang": jezik} for i in range(1, len(n))]
                )
                used += [0] * (len(n) - 1)
                last += len(n) - 1
    return last
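# Hypothetical `docs` shape for update_collection, inferred from the loop above
# (no caller appears in this file):
# docs = [
#     'start', [["Zdravo! Kako mogu da pomognem?"],       # documents for state 'start'
#               ['tv', "Imam problem sa televizijom."]],  # next-state group: 'tv'
# ]
# last = update_collection(docs, last, 'srpski')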
#docus = load_and_split_document(PDF_PATH) + load_and_split_document(PDF_PATH2)
def load_docs(path, jezik):
    """Load a PDF, chunk it, and add the chunks to the collection in batches."""
    global last
    docus = load_and_split_document(path)
    pages = split_text_into_chunks(docus, CHUNK_SIZE, CHUNK_OVERLAP)
    # Add in batches of up to 66 chunks; ids continue from the running `last` counter.
    for batch in batched(range(len(pages)), 66):
        collection.add(
            ids=[f"id{last + j}" for j in range(len(batch))],
            documents=[pages[i].page_content for i in batch],
            metadatas=[dict(pages[i].metadata, used=False, next='None', state='None', lang=jezik) for i in batch],
        )
        last += len(batch)
    return last
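# Example ingestion calls, mirroring the commented-out load above (assumed usage):
# last = load_docs(PDF_PATH, 'srpski')
# last = load_docs(PDF_PATH2, 'srpski')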