"""Gradio question-answering app over a Chroma email store using a fine-tuned phi-2.

The script loads a 4-bit quantized "microsoft/phi-2" base model, applies a PEFT
adapter checkpoint on top of it, wraps it in a text-generation pipeline, and
combines it with a Chroma retriever into a small RAG chain that is exposed
through a Gradio interface.
"""

# Import modules
import logging
import os

import gradio as gr
import torch
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
    pipeline,
)

logger = logging.getLogger(__name__)

# Raises KeyError at startup if the variable is not set.
HUGGINGFACE_ACCESS_TOKEN = os.environ["HUGGINGFACE_ACCESS_TOKEN"]

base_model = "microsoft/phi-2"

# Marker that ends the prompt template; the model's answer follows it.
ANSWER_MARKER = "Output:"

# Define the embedding function
# I use the "all-MiniLM-L6-v2" model
embedding_function = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={"device": "cuda"},  # Use the GPU
)

tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    use_fast=True,
    token=HUGGINGFACE_ACCESS_TOKEN,
)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

# Load the fine-tuned model by applying the adapter on top of the base model
# (checkpointed at 1 epoch = 77 steps)
adapter = "./results/checkpoint-77"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map={"": 0},  # Place the whole model on device 0
    token=HUGGINGFACE_ACCESS_TOKEN,
)
model_ft = PeftModel.from_pretrained(model, adapter)

# For inference, use a text-generation pipeline
# NOTE: you could get a warning such as "The model 'PeftModelForCausalLM' is not
# supported for text-generation", but it's not a problem
config = GenerationConfig(max_new_tokens=200)
pipe = pipeline(
    "text-generation",
    model=model_ft,
    tokenizer=tokenizer,
    generation_config=config,
    framework="pt",
)

# NOTE: Although not strictly required by the assignment, considering that for
# Point 1 we created the embeddings of the emails and saved them in Chroma, it
# is trivial to add a simple RAG system. Basically, when a question is asked,
# some emails (or parts of them) similar to the question are also sent to the
# model as context.

# Load the saved database
persist_directory = "./chroma_db"
db = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding_function,
)

# Set up a retriever so that we get the 2 most similar texts
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 2})

# Wrap the Hugging Face pipeline for langchain
llm = HuggingFacePipeline(pipeline=pipe)

# This is the template we will use for the text to submit to the model.
# The context sentences retrieved by the RAG system are inserted in place of
# {context}, and the question is inserted in place of {question}.
template = """Instruct:
You are an AI assistant for answering questions about the provided context.
You are given the following extracted parts of a document database and a question. Provide a short answer.
If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
=======
{context}
=======
Question: {question}
Output:"""

custom_rag_prompt = PromptTemplate.from_template(template)


def format_docs(docs) -> str:
    """Join the page contents of the retrieved documents with blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
)


def _extract_answer(generated: str) -> str:
    """Return the answer portion of the text produced by the RAG chain.

    When the text contains the "Output:" marker, the answer is the segment
    between the first marker and the next one (or the end of the text). When
    the marker is absent, the whole text is treated as the answer instead of
    failing with an IndexError.
    """
    _, marker, tail = generated.partition(ANSWER_MARKER)
    if not marker:
        return generated.strip()
    # Cut at a second marker in case the model keeps generating further turns.
    return tail.split(ANSWER_MARKER, 1)[0].strip()


def get_answer(question: str) -> str:
    """Answer a question through the RAG chain.

    Args:
        question: The text entered by the user.

    Returns:
        The extracted answer; a short prompt when the question is empty; or
        the error message if the chain raised an exception.
    """
    # Skip retrieval and generation entirely when there is nothing to ask.
    if not question or not question.strip():
        return "Please enter a question."

    try:
        # Submit the question to the pipeline
        generated = rag_chain.invoke(question)
    except Exception as e:
        # UI boundary: keep the app responsive, but record the traceback.
        logger.exception("RAG chain failed for question %r", question)
        return str(e)

    return _extract_answer(generated)


# Define the Gradio interface
interface = gr.Interface(
    fn=get_answer,
    inputs=gr.Textbox(label="Enter your question"),
    outputs=gr.Textbox(label="Answer"),
    title="Enron QA",
    examples=[
        ["What is the strategy in agricultural commodities training?"]
    ],
)

# Launch the web server only when run as a script, not when imported.
if __name__ == "__main__":
    interface.launch()