from langchain_community.document_loaders import WebBaseLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_pinecone import PineconeVectorStore
from gradio_client import Client
import gradio as gr
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

load_dotenv()
HF_TOKEN = os.environ["HF_TOKEN"]
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]

# Initialise Pinecone and the embedding model
# index_name = "db"
pc = Pinecone(api_key=PINECONE_API_KEY)  # initialise a Pinecone instance with the API key

embedder = HuggingFaceInferenceAPIEmbeddings(  # initialise an embedding model
    api_key=HF_TOKEN,
    model_name="mixedbread-ai/mxbai-embed-large-v1",
)

index = "db"

# users = {
#     "aymen": "admin",
#     "amin": "root",
#     "nour": "admin",
#     # Add more users as needed
# }


# Function that loads data from a URL, a free-text description, and/or a PDF.
# Returns a list of text passages.
def load_data(url=None, description=None, pdf=None):
    data = []
    if url is not None:
        try:
            loader = WebBaseLoader(
                url, encoding="utf-8"
            )  # WebBaseLoader: loads and extracts the text content of a web page
            loaded = loader.load()
            for page in loaded:
                data.append(page.page_content)
        except Exception as e:
            print("An error occurred while loading data from the URL:", e)
    if description is not None:
        data.append(description)
    if pdf is not None:
        loader = PyPDFLoader(pdf)  # PyPDFLoader: loads and splits a PDF file into pages
        pages = loader.load_and_split()
        for page in pages:
            data.append(page.page_content)
    return data


# Function to split the loaded data into smaller segments, which makes
# analysis and indexing easier
def split_data(data):
    # data = "\n".join(data)
    # Create a RecursiveCharacterTextSplitter instance: splits the text into
    # chunks of the given size, with an overlap between chunks to avoid
    # losing context
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=60)
    # Split the text documents into smaller chunks
    texts = text_splitter.create_documents(data)
    return texts


# Create the Pinecone index if it does not exist yet. Note that all users
# share the single "db" index; their data is separated by namespace in embed().
def create_user_index(index_name):
    """Creates the Pinecone index if it does not already exist."""
    existing_indexes = [index.name for index in pc.list_indexes()]  # list the existing indexes
    if index_name in existing_indexes:
        # The index already exists, do not recreate it
        return
    # Create the index (the dimension must match the embedding model)
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
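
# Minimal ingestion sketch, kept as a comment so the script's behaviour is
# unchanged. "demo-user" and the sample text are hypothetical; assumes valid
# HF_TOKEN and PINECONE_API_KEY:
# docs = split_data(["Our spring campaign targets first-time buyers ..."])
# embed(docs, "demo-user")  # upserts vectors into the "db" index under namespace "demo-user"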

# embed: creates embeddings for the split documents and stores them in a
# Pinecone vector store, under a namespace named after the user
def embed(split_docs, username):
    # Create or verify the index for the user
    create_user_index(index)
    # Build a Pinecone vector store from the split documents
    PineconeVectorStore.from_documents(
        # PineconeVectorStore.from_documents: creates and stores vectors
        # for the given documents
        documents=split_docs,
        index_name=index,
        embedding=embedder,
        namespace=username,
    )
    # Create a retriever
    # retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 3})
    # return retriever


# Document retrieval
# Retrieve documents from the dataset
def retrieve(prompt, username):
    vectorstore = PineconeVectorStore.from_existing_index(
        # from_existing_index: initialises a vector store from an existing index
        index_name=index,
        embedding=embedder,
        namespace=username,
    )
    retriever = vectorstore.as_retriever(
        search_type="mmr", search_kwargs={"k": 2}
    )  # as_retriever: creates a retriever for querying the vector store
    retrieved_docs = retriever.invoke(prompt)
    return retrieved_docs


def format_prompt(prompt, retrieved_documents, tone, marketing_technique, social_media):
    prompt = f"""You are an assistant for digital marketing.
You are given the extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, just ignore the context.
Question:
{prompt}
"""
    if tone != "Default":
        prompt += f"Tone: {tone}\n"
    if marketing_technique != "Default":
        prompt += f"Marketing technique: {marketing_technique}\n"
    if social_media != "Default":
        prompt += f"Social media platform: {social_media}\n"
    prompt += "Context:\n"
    for document in retrieved_documents:
        prompt += f"{document.page_content}\n"
    # prompt += """If you don't know the answer, just say "I do not know." Don't make up an answer."""
    return prompt


# What is the capital of France? based on the following context,
# Context:
# France is a country located in Western Europe. Its capital is Paris.
# Paris is not only the capital of France, but also the country's largest city.
# based on the following context
# If you don't know the answer, just say "I do not know." Don't make up an answer.


def clear_history(history):
    return []


# Function to query a Mistral LLM through the API of a Hugging Face Space
def ask_mistral(prompt):
    client = Client("hysts/mistral-7b")
    result = client.predict(
        message=prompt,
        max_new_tokens=1024,
        temperature=0.6,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.2,
        api_name="/chat",
    )
    return result


def inject_history(final_prompt, history):
    if len(history) > 0:
        final_prompt = final_prompt + "\n\nHistory:\n"
        for user, assistant in history:
            final_prompt = final_prompt + "USER : " + user + "\n"
            final_prompt = final_prompt + "ASSISTANT : " + assistant + "\n"
        return final_prompt
    else:
        return final_prompt


# what is my name based on the following context.
# context:
# retrieved documents:
# and the following history of the conversation:
# USER : my name is nour
# ASSISTANT : hi nour
# USER : what is my name ?
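
# End-to-end sketch of the retrieve -> format_prompt -> ask_mistral pipeline
# that user_retrieve_and_generate() below wires together ("demo-user" and the
# question are hypothetical; kept as a comment):
# docs = retrieve("What products do we sell?", "demo-user")
# final = format_prompt("What products do we sell?", docs, "funny", "AIDA", "instagram")
# print(ask_mistral(final))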

def upload_user_data(username, url=None, description=None, pdf_file=None):
    data = load_data(url, description, pdf_file)
    splitted_data = split_data(data)
    embed(splitted_data, username)
    message = f"Data has been uploaded successfully to {username}"
    return message


def user_retrieve_and_generate(
    username, tone, marketing_technique, prompt, history, social_media
):
    # Retrieve data from the vector store
    retrieved_documents = retrieve(prompt, username)
    # Format the prompt
    formatted_prompt = format_prompt(
        prompt, retrieved_documents, tone, marketing_technique, social_media
    )
    # Inject the conversation history
    # formatted_prompt = inject_history(formatted_prompt, history)
    # Ask Mistral
    result = ask_mistral(formatted_prompt)
    # history.append([prompt, result])
    new_history = history + [(prompt, result)]
    return new_history


# def custom_auth(username, password):
#     if username in users and users[username] == password:
#         return True
#     return False


upload_data = gr.Interface(
    fn=upload_user_data,
    inputs=[
        gr.Textbox(label="username"),
        gr.Textbox(label="URL"),
        gr.Textbox(label="Description"),
        gr.File(label="PDF", type="filepath", file_count="single"),
    ],
    outputs=gr.Textbox(label="Output"),
    title="Upload Data",
    description="Upload your data to extract text and answer questions.",
    api_name="upload",
)


def clear_prompt(prompt):
    return ""


with gr.Blocks() as user_interface:
    gr.Markdown(
        value="""User interface to retrieve and generate text based on uploaded data.""",
        label=None,
    )
    username = gr.Textbox(label="username")
    with gr.Accordion("Extra options ⚙️", open=False):
        tone = gr.Dropdown(
            ["Default", "neutral", "funny", "serious", "formal"],
            value="Default",
            label="tone of voice used in the replies",
        )
        marketing_technique = gr.Radio(
            [
                "Default",
                "Retargeting",
                "AIDA",
                "Promotion",
                "Testimonial",
                "FOMO",
                "Before and after",
                "Problem and solution",
            ],
            value="Default",
            label="marketing technique to be used in the replies",
        )
        social_media = gr.Radio(
            ["Default", "instagram", "facebook", "twitter"],
            value="Default",
            label="social media platform to be used in the replies",
        )
    chatbot = gr.Chatbot(height=450, label="Gradio ChatInterface", show_copy_button=True)
    prompt = gr.Textbox(label="prompt")
    with gr.Row():
        clear = gr.Button("🗑️Clear", variant="secondary")
        submit = gr.Button("✅Submit", variant="primary")
    submit.click(
        fn=user_retrieve_and_generate,
        inputs=[username, tone, marketing_technique, prompt, chatbot, social_media],
        outputs=[chatbot],
        api_name="generate",
    ).then(clear_prompt, inputs=prompt, outputs=prompt, show_api=False)
    prompt.submit(
        fn=user_retrieve_and_generate,
        inputs=[username, tone, marketing_technique, prompt, chatbot, social_media],
        outputs=[chatbot],
        api_name=False,
    ).then(clear_prompt, inputs=prompt, outputs=prompt, show_api=False)
    clear.click(fn=clear_history, inputs=chatbot, outputs=chatbot, show_api=False)

demo = gr.TabbedInterface(
    [upload_data, user_interface],
    ["upload", "generate"],
    theme="upsatwal/mlsc_tiet",
)

if __name__ == "__main__":
    demo.launch(
        debug=True
        # , auth=custom_auth, auth_message="Enter your username and password"
    )
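
# Sketch of calling the exposed API endpoints with gradio_client once the app
# is running (the URL, username, and input values are hypothetical):
# client = Client("http://127.0.0.1:7860/")
# client.predict("demo-user", "https://example.com", "", None, api_name="/upload")
# client.predict("demo-user", "Default", "Default", "Write a post", [], "Default", api_name="/generate")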