Spaces:
Sleeping
Sleeping
File size: 6,492 Bytes
c4f1846 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
import time
import chromadb
from chromadb.utils import embedding_functions
from test.new import connect_to_llama
# from transformers import pipeline
import gradio as gr
import PyPDF2
import os
from chunkipy.text_chunker import split_by_sentences
import langid
from translate import Translator
# Module-level Chroma client used by the request handlers defined in this file.
# NOTE(review): PersistentClient() is called without arguments, so the storage
# location is whatever the library defaults to - confirm that is intended.
chroma_client = chromadb.PersistentClient()
from test.llama import llama_local
# Directory the process was started in. The ingest code changes into a client
# folder and uses this value to change back afterwards.
working_dir = os.getcwd()
# checkpoint = f"{working_dir}/LaMini-T5-738M"
# model = pipeline('text2text-generation', model=checkpoint)
# input_prompt = """Answer the following question related reasoning answers from the following contexts that is given ..Don't generate answer from your data generate only from the provided contexts
# ..If the contexts doesn't provide an answer or isn't related to the question, respond with "there is no answer for the provided question"
# Question:"{}",
# Contexts:"{}"
# Answer:
# """
def detect_and_translate_query(query, context, dest_language='en'):
    """Detect the language of ``query`` and translate query and context.

    Args:
        query: The user's question. Its language is detected with
            ``langid.classify``.
        context: Retrieved passage(s). A list is joined with single spaces
            into one string before translation.
        dest_language: Language code to translate into (default ``'en'``).

    Returns:
        A tuple ``(translated_query, translated_context, input_language)``.
        ``input_language`` is the code reported by ``langid``. When it
        equals ``dest_language`` the query and the (joined) context are
        returned untranslated.
    """
    input_language, _ = langid.classify(query)
    if isinstance(context, list):
        context = " ".join(context)

    # Nothing to translate when the text is already in the target language.
    # NOTE(review): the default backend of the ``translate`` package is
    # reported to answer identical language pairs with an error message
    # instead of the text - confirm against the installed version.
    if input_language == dest_language:
        return query, context, input_language

    translator = Translator(to_lang=dest_language, from_lang=input_language)
    translated_query = translator.translate(query)
    translated_context = translator.translate(context)
    return translated_query, translated_context, input_language
def translate_response(response, source_language, dest_language):
    """Translate ``response`` from ``dest_language`` into ``source_language``.

    The parameter names follow the query's point of view: ``source_language``
    is the language the user wrote in (the language to translate *to* here)
    and ``dest_language`` is the language the response is currently in.

    Args:
        response: Text to translate.
        source_language: Language code to translate into.
        dest_language: Language code ``response`` is written in.

    Returns:
        The translated text, or ``response`` unchanged when both language
        codes are equal.
    """
    if source_language == dest_language:
        # Same language on both sides: skip the translation round trip.
        translated_response = response
    else:
        translator = Translator(to_lang=source_language, from_lang=dest_language)
        translated_response = translator.translate(response)
    # Log the translated text (the original logged the function object).
    print("translate_response " + str(translated_response))
    return translated_response
def create_multiple_db(path,collection,working_dir):
    """Read every PDF in ``path``, split it into sentences and store them.

    Each PDF's text is extracted page by page, split with
    ``split_by_sentences`` and added to ``collection`` together with
    embeddings from the ``all-MiniLM-L6-v2`` sentence-transformer model.
    Every chunk carries its PDF's metadata, with ``"/Title"`` overwritten
    by the file name. Chunk ids are ``"id0"``, ``"id1"``, ... numbered
    continuously across all files.

    Args:
        path: Directory containing the PDF files.
        collection: Object exposing ``add(documents=, embeddings=, ids=,
            metadatas=)``; the callers in this file pass a Chroma collection.
        working_dir: Directory to ``chdir`` back into once the files have
            been read, whether or not reading succeeded.

    Returns:
        The string ``"done"``.
    """
    per_file = []  # one (chunks, metadata) pair per PDF that produced text
    try:
        filelist = os.listdir(path)
        print(filelist)
        for file_n in filelist:
            # Build the full path so reading does not depend on the caller
            # having changed into ``path`` first.
            file_path = os.path.join(path, file_n)
            # Skip sub-directories and anything that is not a PDF.
            if not (os.path.isfile(file_path) and file_n.lower().endswith(".pdf")):
                continue
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # ``metadata`` can be None for PDFs without an info dictionary.
                meta_data = dict(pdf_reader.metadata or {})
                print("De elmeta data before: ",meta_data)
                meta_data.update({"/Title":file_n})
                print("De elmeta data after: ", meta_data)
                data = "".join(
                    page.extract_text() or "" for page in pdf_reader.pages
                )
            chunk = split_by_sentences(data)
            for i, chunks in enumerate(chunk):
                print(f"chunks{i}:", chunks)
            # A PDF with no extractable text yields nothing to store.
            if len(chunk) > 0:
                per_file.append((chunk, meta_data))
    finally:
        # Always go back, otherwise a failed ingest leaves the process
        # inside the client folder for every later request.
        os.chdir(working_dir)

    metadata_buff = [meta for _, meta in per_file]
    print(metadata_buff,"\n",len(metadata_buff))
    if not per_file:
        # Nothing to embed: avoid loading the embedding model at all.
        return "done"

    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
    next_id = 0
    for data, meta in per_file:
        print(data)
        collection.add(
            documents=data,
            embeddings=sentence_transformer_ef(data),
            ids=['id' + str(next_id + offset) for offset in range(len(data))],
            metadatas=[meta] * len(data),
        )
        next_id += len(data)
    return "done"
def architecture_with_chroma(data):
    """Answer a query from the client's Chroma collection via the LLM.

    Args:
        data: Text of a dict literal with the keys ``'id'`` (client id) and
            ``'query'`` (the question), e.g. ``"{'id': 7, 'query': '...'}"``.

    Returns:
        The model's answer passed through ``translate_response``, or a
        human-readable error string when the payload is not a dict or lacks
        an id or a query.
    """
    # Local import keeps this function self-contained.
    import ast

    # SECURITY: the payload arrives from a text box, so it is parsed with
    # ``ast.literal_eval`` (literals only) instead of ``eval``, which would
    # execute arbitrary expressions typed by the user.
    try:
        data_dict = ast.literal_eval(data)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return "please enter a valid json (dict) to process"
    if not isinstance(data_dict, dict):
        return "please enter a valid json (dict) to process"

    client_id = data_dict.get('id')
    if client_id is None:
        return "please enter an id to process on the prompt"
    collection_name = "mate" + str(client_id)

    query = data_dict.get('query')
    if query is None or query == "":
        return "please enter a query to process"

    collection = chroma_client.get_or_create_collection(name=collection_name)
    results = collection.query(
        query_texts=[query],
        n_results=5
    )
    context = results.get('documents')[0]
    results_metadata = list(results.get("metadatas")[0])
    # Prefix each hit with the title of the file it came from. Pairing with
    # zip copes with collections that return fewer than five hits.
    results_documents = [
        f"In {(meta or {}).get('/Title')}:" + document
        for meta, document in zip(results_metadata, context)
    ]
    for document in results_documents:
        print(document)
    print(context)

    # NOTE(review): this stops the module-level client that every request
    # shares - confirm later requests still work after the first one.
    chroma_client.stop()

    # NOTE(review): the translated query/context are only logged; the model
    # below still receives the untranslated query - confirm that is intended.
    translated_query, translated_context, input_language = detect_and_translate_query(query, context)
    print('translated_query '+str(translated_query))
    print('translated_context '+str(translated_context))

    answer = connect_to_llama(query,results_documents)
    translated_response = translate_response(answer, input_language, dest_language='en')
    return translated_response
def create(data):
    """Build a fresh Chroma collection from the PDFs in a client's folder.

    The folder is ``"mate<id>"`` relative to the current directory and the
    collection gets the same name. Any existing collection of that name is
    dropped first.

    Args:
        data: Text of a dict literal containing the key ``'id'``.

    Returns:
        ``"done making data for client"`` on success, otherwise a
        human-readable error string.
    """
    # Local import keeps this function self-contained.
    import ast

    print(data)
    print(type(data))
    # SECURITY: parse with ``ast.literal_eval`` (literals only) rather than
    # ``eval``, which would execute whatever the user typed.
    try:
        payload = ast.literal_eval(data)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return "please enter a valid json (dict) to process"
    if not isinstance(payload, dict):
        return "please enter a valid json (dict) to process"

    client_id = payload.get('id')
    if client_id is None:
        return "please enter an id to process on the prompt"
    # NOTE(review): the id is used unsanitised as a relative path and as a
    # collection name - consider restricting it to a safe character set.
    client_dir = "mate" + str(client_id)
    if not os.path.isdir(client_dir):
        return "sorry ,there is no directory for this client"

    # Start from an empty collection. Making sure it exists before deleting
    # avoids the error Chroma raises when deleting a missing collection,
    # which is the normal situation the first time a client is created.
    chroma_client.get_or_create_collection(name=client_dir)
    chroma_client.delete_collection(name=client_dir)
    collection = chroma_client.get_or_create_collection(name=client_dir)

    try:
        os.chdir(client_dir)
        return create_multiple_db(os.getcwd(),collection,working_dir)+" making data for client"
    finally:
        # Restore the working directory even if the ingest fails.
        os.chdir(working_dir)
def update(data):
    """Rebuild the Chroma collection for a client from its PDF folder.

    The folder is ``"mate<id>"`` relative to the current directory and the
    collection has the same name. The existing collection is dropped and a
    new one is filled from the PDFs.

    Args:
        data: Text of a dict literal containing the key ``'id'``.

    Returns:
        ``"done updating client embeddings"`` on success, otherwise a
        human-readable error string.
    """
    # Local import keeps this function self-contained.
    import ast

    print(data)
    print(type(data))
    # SECURITY: parse with ``ast.literal_eval`` (literals only) rather than
    # ``eval``, which would execute whatever the user typed.
    try:
        payload = ast.literal_eval(data)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return "please enter a valid json (dict) to process"
    if not isinstance(payload, dict):
        return "please enter a valid json (dict) to process"

    client_id = payload.get('id')
    if client_id is None:
        return "please enter an id to process on the prompt"
    # NOTE(review): the id is used unsanitised as a relative path and as a
    # collection name - consider restricting it to a safe character set.
    client_dir = "mate" + str(client_id)
    if not os.path.isdir(client_dir):
        return "sorry ,there is no directory for this client"

    # Making sure the collection exists before deleting it avoids the error
    # Chroma raises when asked to delete a collection that is not there.
    chroma_client.get_or_create_collection(name=client_dir)
    chroma_client.delete_collection(name=client_dir)
    collection = chroma_client.create_collection(name=client_dir)

    try:
        os.chdir(client_dir)
        # The leading space separates the message from "done".
        return create_multiple_db(os.getcwd(),collection,working_dir)+" updating client embeddings"
    finally:
        # Restore the working directory even if the ingest fails.
        os.chdir(working_dir)
# Gradio front end: one input box, one output box and three buttons, each
# bound to one of the handlers above and exposed under its own API name.
with gr.Blocks() as iface:
    name = gr.Textbox(label="Name")
    output = gr.Textbox(label="Output Box")

    # Answer a question from the client's collection.
    process_btn = gr.Button("process")
    process_btn.click(
        fn=architecture_with_chroma,
        inputs=name,
        outputs=output,
        api_name="process",
    )

    # Build the client's collection from its PDF folder.
    create_btn = gr.Button("create")
    create_btn.click(
        fn=create,
        inputs=name,
        outputs=output,
        api_name="create",
    )

    # Rebuild the client's collection.
    update_btn = gr.Button("update")
    update_btn.click(
        fn=update,
        inputs=name,
        outputs=output,
        api_name="update",
    )

# Start the web server as soon as the module is executed.
iface.launch()
|