"""Gradio front-end for a per-client PDF question-answering service.

Three actions are exposed through the UI / API:

* ``process`` -- query a client's Chroma collection and ask the LLM.
* ``create``  -- index the PDFs of a client directory into its collection.
* ``update``  -- drop the client's collection and index its PDFs again.

Every action receives one text payload that must describe a dict, e.g.
``{"id": 3, "query": "..."}``.  The client directory and the collection are
both named ``"mate" + str(id)``.
"""
import ast
import json
import os
import time

import chromadb
import gradio as gr
import langid
import PyPDF2
from chromadb.utils import embedding_functions
from chunkipy.text_chunker import split_by_sentences
from translate import Translator
# from transformers import pipeline

from test.llama import llama_local
from test.new import connect_to_llama

chroma_client = chromadb.PersistentClient()
working_dir = os.getcwd()

# Legacy local-model path, currently disabled:
# checkpoint = f"{working_dir}/LaMini-T5-738M"
# model = pipeline('text2text-generation', model=checkpoint)
# input_prompt = """Answer the following question related reasoning answers from the following contexts that is given ..Don't generate answer from your data generate only from the provided contexts
# ..If the contexts doesn't provide an answer or isn't related to the question, respond with "there is no answer for the provided question"
# Question:"{}",
# Contexts:"{}"
# Answer:
# """


def _parse_request(data):
    """Parse the textbox payload into a dict; return None when it is not one.

    SECURITY: the payload comes straight from the UI / API, so it must never
    reach ``eval``.  JSON is tried first, then ``ast.literal_eval`` so that
    Python-style dict literals (single quotes) keep working.
    """
    for parser in (json.loads, ast.literal_eval):
        try:
            parsed = parser(data)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def detect_and_translate_query(query, context, dest_language='en'):
    """Detect the language of *query* and translate query and context.

    Args:
        query: The user's question.
        context: A string, or a list of strings that is joined with spaces.
        dest_language: Language code to translate into.

    Returns:
        ``(translated_query, translated_context, input_language)`` where
        ``input_language`` is the code reported by ``langid``.
    """
    input_language, _ = langid.classify(query)
    if isinstance(context, list):
        context = " ".join(context)
    if input_language == dest_language:
        # Nothing to translate; avoids a pointless round trip to the
        # translation provider for a same-language pair.
        return query, context, input_language
    translator = Translator(to_lang=dest_language, from_lang=input_language)
    translated_query = translator.translate(query)
    translated_context = translator.translate(context)
    return translated_query, translated_context, input_language


def translate_response(response, source_language, dest_language):
    """Translate *response* from ``dest_language`` back to ``source_language``.

    The parameter names follow the query's point of view: ``source_language``
    is the language the user wrote in (the translation target here) and
    ``dest_language`` is the language the response is currently written in.
    """
    if source_language == dest_language:
        translated_response = response
    else:
        translator = Translator(to_lang=source_language, from_lang=dest_language)
        translated_response = translator.translate(response)
    print("translate_response " + str(translated_response))
    return translated_response


def create_multiple_db(path, collection, working_dir):
    """Index every PDF found in *path* into *collection*.

    Each file's text is extracted, split into sentence chunks and added to the
    collection together with the PDF metadata; the ``/Title`` entry is
    overwritten with the file name.

    Args:
        path: Directory holding the PDF files.
        collection: Chroma collection that receives the chunks.
        working_dir: Directory that becomes the current working directory
            once the files have been read (also when reading fails).

    Returns:
        The string ``"done"``.
    """
    filelist = os.listdir(path)
    print(filelist)
    data_pdfs = []
    metadata_buff = []
    try:
        for file_n in filelist:
            file_path = os.path.join(path, file_n)
            if not os.path.isfile(file_path):
                # os.listdir also returns sub-directories, which cannot be
                # opened as PDF files.
                continue
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                # ``metadata`` is None for PDFs without an info dictionary.
                meta_data = dict(pdf_reader.metadata or {})
                print("De elmeta data before: ", meta_data)
                meta_data.update({"/Title": file_n})
                print("De elmeta data after: ", meta_data)
                data = "".join(
                    page.extract_text() or "" for page in pdf_reader.pages
                )
            chunk = list(split_by_sentences(data))
            for i, chunks in enumerate(chunk):
                print(f"chunks{i}:", chunks)
            if not chunk:
                # No extractable text: there is nothing to embed or store.
                continue
            data_pdfs.append(chunk)
            metadata_buff.append(meta_data)
    finally:
        # Always go back, otherwise later relative directory checks would be
        # resolved against the wrong directory.
        os.chdir(working_dir)
    print(metadata_buff, "\n", len(metadata_buff))

    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="all-MiniLM-L6-v2"
    )
    next_id = 0  # running offset so ids stay unique across files
    for data, meta_data in zip(data_pdfs, metadata_buff):
        print(data)
        collection.add(
            documents=data,
            embeddings=sentence_transformer_ef(data),
            ids=['id' + str(next_id + x) for x in range(len(data))],
            metadatas=[meta_data] * len(data),
        )
        next_id += len(data)
    return "done"


def architecture_with_chroma(data):
    """Answer a query from the client's collection.

    Args:
        data: Text describing a dict with the keys ``id`` and ``query``.

    Returns:
        The LLM answer translated back to the query's language, or a
        human-readable message when the request cannot be processed.
    """
    data_dict = _parse_request(data)
    if data_dict is None:
        return "please enter a valid json (dict) to process"
    client_id = data_dict.get('id')
    if client_id is None:
        return "please enter an id to process on the prompt"
    collection_name = "mate" + str(client_id)
    query = data_dict.get('query')
    if not query or not isinstance(query, str):
        return "please enter a query to process"
    if not os.path.isdir(collection_name):
        return "sorry ,there is no directory for this client"

    collection = chroma_client.get_or_create_collection(name=collection_name)
    results = collection.query(
        query_texts=[query],
        n_results=5
    )
    print(results, " de elresults\n")
    context = results.get('documents')[0]
    results_metadata = list(results.get("metadatas")[0])
    results_documents = list(results.get("documents")[0])
    print(len(results_documents), "da el len bta3 elcontexts\n")
    print(results_documents)
    # Prefix each hit with the file it came from.  Driven by the actual number
    # of hits, which can be lower than the requested n_results.
    results_documents = [
        f"In {(meta or {}).get('/Title')}:" + document
        for meta, document in zip(results_metadata, results_documents)
    ]
    for document in results_documents:
        print(document)
    print(context)
    # generated_text = model(input_prompt.format(query+"? answer reasoning answers from the provided contexts only that is related and contains this information ", context), max_length=1024, do_sample=False)[0]['generated_text']
    # print(input_prompt)
    # NOTE(review): confirm that the installed chromadb client exposes stop()
    # and that the client stays usable for later requests afterwards.
    chroma_client.stop()
    translated_query, translated_context, input_language = detect_and_translate_query(query, context)
    print('translated_query ' + str(translated_query))
    print('translated_context ' + str(translated_context))
    results = connect_to_llama(query, results_documents)
    # results=llama_local(query,results_documents)
    translated_response = translate_response(results, input_language, dest_language='en')
    return translated_response
    # return results
    # return generated_text


def create(data):
    """Index the PDFs of a client directory into its (possibly new) collection.

    Args:
        data: Text describing a dict with the key ``id``.

    Returns:
        A status message.
    """
    print(data)
    print(type(data))
    request = _parse_request(data)
    if request is None:
        return "please enter a valid json (dict) to process"
    client_id = request.get('id')
    if client_id is None:
        return "please enter an id to process on the prompt"
    client_dir = "mate" + str(client_id)
    if not os.path.isdir(client_dir):
        return "sorry ,there is no directory for this client"
    collection = chroma_client.get_or_create_collection(name=client_dir)
    # The absolute path is handed over instead of chdir-ing into the client
    # directory, so a failure cannot leave the process in the wrong directory.
    return create_multiple_db(os.path.abspath(client_dir), collection, working_dir) + " making data for client"


def update(data):
    """Rebuild a client's collection from scratch.

    Args:
        data: Text describing a dict with the key ``id``.

    Returns:
        A status message.
    """
    print(data)
    print(type(data))
    request = _parse_request(data)
    if request is None:
        return "please enter a valid json (dict) to process"
    client_id = request.get('id')
    if client_id is None:
        return "please enter an id to process on the prompt"
    client_dir = "mate" + str(client_id)
    if not os.path.isdir(client_dir):
        return "sorry ,there is no directory for this client"
    try:
        chroma_client.delete_collection(name=client_dir)
    except Exception as exc:
        # Best effort: the collection may simply not exist yet.  The exception
        # type differs between chromadb releases, hence the broad catch; a
        # real failure resurfaces in create_collection below.
        print("delete_collection skipped: " + str(exc))
    collection = chroma_client.create_collection(name=client_dir)
    return create_multiple_db(os.path.abspath(client_dir), collection, working_dir) + "updating client embeddings"


iface = gr.Blocks()

with iface:
    name = gr.Textbox(label="Name")
    output = gr.Textbox(label="Output Box")
    process_btn = gr.Button("process")
    process_btn.click(fn=architecture_with_chroma, inputs=name, outputs=output, api_name="process")
    create_btn = gr.Button("create")
    create_btn.click(fn=create, inputs=name, outputs=output, api_name="create")
    update_btn = gr.Button("update")
    update_btn.click(fn=update, inputs=name, outputs=output, api_name="update")

iface.launch()