"""Gradio chat front-end that augments user questions with a retrieved source.

At import time the script loads a BGE-M3 embedding model, a pre-computed
embedding matrix and the matching texts. Each incoming chat message is embedded
and the most similar stored text is attached to the prompt.
"""

import functools
import json
import os
import re
import shutil
from threading import Thread

import chromadb
import gradio as gr
import numpy as np
import pandas as pd
import requests
import torch
import transformers
from chromadb.config import Settings
from chromadb.utils import embedding_functions
from FlagEmbedding import BGEM3FlagModel
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
    TextIteratorStreamer,
)
from vllm import LLM, SamplingParams

# Setting use_fp16 to True speeds up computation with a slight performance
# degradation.
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

# Pre-computed document embeddings and the texts they were computed from.
# NOTE(review): assumes row i of `embeddings` corresponds to
# `embeddings_text[i]` -- confirm against the script that produced both files.
embeddings = np.load("embeddings_with_api.npy")
embeddings_data = pd.read_json("embeddings_tchap.json")
embeddings_text = embeddings_data["text_with_context"].tolist()

# Sampling settings. They are not consumed by the code below; kept as module
# constants for a generation backend that is not wired in yet.
temperature = 0.2
max_new_tokens = 1000
top_p = 0.92
repetition_penalty = 1.7

# Model name taken from the previously commented-out configuration
# (`LLM(model_name, max_model_len=4096)`).
# NOTE(review): confirm this checkpoint is the intended generation model.
GENERATION_MODEL_NAME = "Pclanglais/Tchap"

# The stylesheet referenced by the UI was never defined in this file; None
# falls back to the theme's own styling.
css = None


def _cosine_similarity(queries, documents):
    """Return the cosine-similarity matrix between two sets of row vectors.

    Args:
        queries: Array-like of shape (n_queries, dim).
        documents: Array-like of shape (n_documents, dim).

    Returns:
        A float32 array of shape (n_queries, n_documents).
    """
    queries = np.asarray(queries, dtype=np.float32)
    documents = np.asarray(documents, dtype=np.float32)
    query_norms = np.linalg.norm(queries, axis=1, keepdims=True)
    document_norms = np.linalg.norm(documents, axis=1, keepdims=True)
    # A zero vector has no direction; dividing by 1 keeps it at zero
    # similarity instead of producing NaN.
    query_norms[query_norms == 0] = 1.0
    document_norms[document_norms == 0] = 1.0
    return (queries / query_norms) @ (documents / document_norms).T


def vector_search(sentence_query):
    """Return the stored text whose embedding is closest to the query.

    Args:
        sentence_query: The user question to embed.

    Returns:
        The entry of `embeddings_text` with the highest cosine similarity to
        the query embedding.
    """
    query_embedding = model.encode(
        sentence_query,
        batch_size=12,
        # A short max_length speeds up encoding; raise it for longer queries.
        max_length=256,
    )["dense_vecs"]

    # The similarity helper expects 2-D input: one row per query.
    query_vector = np.asarray(query_embedding).reshape(1, -1)
    similarities = _cosine_similarity(query_vector, embeddings)

    # argmax works on the flattened (1, n_documents) matrix, so the result is
    # directly a document index.
    closest_doc_index = int(np.argmax(similarities))
    return embeddings_text[closest_doc_index]


class StopOnTokens(StoppingCriteria):
    """Stopping criterion that ends generation on specific token ids."""

    # Token ids that terminate generation.
    # NOTE(review): these ids are tokenizer-specific -- confirm they match the
    # generation model's vocabulary.
    stop_ids = (29, 0)

    def __call__(
        self,
        input_ids: torch.LongTensor,
        scores: torch.FloatTensor,
        **kwargs,
    ) -> bool:
        """Return True when the last token of the first sequence is a stop id."""
        return int(input_ids[0][-1]) in self.stop_ids


def _build_prompt(message, history):
    """Flatten the chat history plus the new message into one prompt string.

    Args:
        message: The new user message.
        history: Sequence of (user_message, bot_message) pairs; may be empty.

    Returns:
        The concatenated prompt, ending with an empty bot turn.
    """
    turns = list(history or []) + [[message, ""]]
    # NOTE(review): the "\n:" role prefixes look truncated (role tags may have
    # been lost) -- confirm the prompt format the model expects.
    return "".join(
        "".join(["\n:" + (turn[0] or ""), "\n:" + (turn[1] or "")])
        for turn in turns
    )


def predict(message, history):
    """Build the retrieval-augmented prompt for a chat message.

    The closest stored text is appended to the message under a
    "### Source ###" header. No generation backend is wired in, so the
    assembled prompt itself is returned as the reply.

    Args:
        message: The new user message.
        history: Sequence of (user_message, bot_message) pairs.

    Returns:
        The prompt string containing the history, the message and its source.
    """
    source_text = vector_search(message)
    message = message + "\n\n### Source ###\n" + source_text
    return _build_prompt(message, history)


@functools.lru_cache(maxsize=1)
def _load_generation_backend():
    """Load the tokenizer and causal LM once, on first use.

    Loading lazily keeps import-time behaviour unchanged for callers that only
    use retrieval.

    Returns:
        A (tokenizer, model, device) tuple.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(GENERATION_MODEL_NAME)
    generator = AutoModelForCausalLM.from_pretrained(GENERATION_MODEL_NAME)
    return tokenizer, generator.to(device), device


def predict_alt(message, history):
    """Stream a generated reply for a chat message, without retrieval.

    Generation runs in a background thread while this generator yields the
    reply accumulated so far after each streamed chunk.

    Args:
        message: The new user message.
        history: Sequence of (user_message, bot_message) pairs.

    Yields:
        The partial reply text, growing with each new chunk.
    """
    tokenizer, generator, device = _load_generation_backend()
    prompt = _build_prompt(message, history)

    model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=10.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.95,
        top_k=1000,
        temperature=1.0,
        num_beams=1,
        stopping_criteria=StoppingCriteriaList([StopOnTokens()]),
    )

    # generate() blocks until completion, so it runs in a worker thread while
    # this generator consumes the streamer.
    worker = Thread(target=generator.generate, kwargs=generate_kwargs)
    worker.start()

    partial_message = ""
    for new_token in streamer:
        # A lone "<" chunk is dropped rather than shown to the user.
        if new_token != "<":
            partial_message += new_token
            yield partial_message


# Interface metadata. Not passed to the UI below; kept for callers that import
# these names.
title = "Tchap"
description = "Le chatbot du service public"
examples = [
    [
        "Qui peut bénéficier de l'AIP?",  # user_message
        0.7,  # temperature
    ],
]

with gr.Blocks(theme="JohnSmith9982/small_and_pretty", css=css) as demo:
    gr.HTML("\n\nAlbert-Tchap\n\n")
    # Rendered inside the Blocks layout; the server is started only from the
    # __main__ guard so importing this module does not block.
    gr.ChatInterface(predict)

if __name__ == "__main__":
    demo.queue().launch()