justinj92 committed on
Commit 9f02f73
Parent: 7b4f74e

Update app.py

Files changed (1)
  1. app.py +284 -190
app.py CHANGED
@@ -1,261 +1,355 @@
- import gradio as gr
- import torch
- from transformers import (
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     TextIteratorStreamer,
-     pipeline
- )
- import os
- from threading import Thread
- import spaces
- import time

- import langchain
- import os
- import glob
- import gc

- # loaders
- from langchain.document_loaders import PyPDFLoader, DirectoryLoader

- # splits
- from langchain.text_splitter import RecursiveCharacterTextSplitter

- # prompts
- from langchain import PromptTemplate

- # vector stores
- from langchain_community.vectorstores import FAISS

- # models
- from langchain.llms import HuggingFacePipeline
- from langchain.embeddings import HuggingFaceInstructEmbeddings

- # retrievers
- from langchain.chains import RetrievalQA


- import subprocess

- subprocess.run(
-     "pip install flash-attn --no-build-isolation",
-     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-     shell=True,
- )


- class CFG:
-     DEBUG = False

-     ### LLM
-     model_name = 'justinj92/phi3-orpo'
-     temperature = 0.7
-     top_p = 0.90
-     repetition_penalty = 1.15
-     max_len = 8192
-     max_new_tokens = 512
-
-     ### splitting
-     split_chunk_size = 800
-     split_overlap = 400

-     ### embeddings
-     embeddings_model_repo = 'BAAI/bge-base-en-v1.5'

-     ### similar passages
-     k = 6

-     ### paths
-     PDFs_path = './data'
-     Embeddings_path = './embeddings/input'
-     Output_folder = './ml-papers-vector'

- loader = DirectoryLoader(CFG.PDFs_path, glob="*.pdf", loader_cls=PyPDFLoader)

- documents = loader.load()


- text_splitter = RecursiveCharacterTextSplitter(chunk_size = CFG.split_chunk_size, chunk_overlap = CFG.split_overlap)
- texts = text_splitter.split_documents(documents)

- if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):
-     embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
-     vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)
-     vectordb.save_local(f"{CFG.Output_folder}/faiss_index_ml_papers")

- embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
- vectordb = FAISS.load_local(CFG.Output_folder + '/faiss_index_ml_papers', embeddings, allow_dangerous_deserialization=True)


- def build_model(model_repo = CFG.model_name):
-     tokenizer = AutoTokenizer.from_pretrained(model_repo)
-     model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
-     if torch.cuda.is_available():
-         device = torch.device("cuda")
-         print(f"Using GPU: {torch.cuda.get_device_name(device)}")
-     else:
-         device = torch.device("cpu")
-         print("Using CPU")
-     device = torch.device("cuda")
-     model = model.to(device)

-     return tokenizer, model


- tok, model = build_model(model_repo = CFG.model_name)

- terminators = [
-     tok.eos_token_id,
-     32007,
-     32011,
-     32001,
-     32000
- ]


- pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)

- llm = HuggingFacePipeline(pipeline = pipe)

- prompt_template = """
- <|system|>

- You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).

- You are given some extracted parts from machine learning papers along with a question.

- If you don't know the answer, just say "I don't know." Don't try to make up an answer.

- It is very important that you ALWAYS answer the question in the same language the question is in. Remember to always do that.

- Use only the following pieces of context to answer the question at the end.

- <|end|>

- <|user|>

- Context: {context}

- Question is below. Remember to answer in the same language:

- Question: {question}

- <|end|>

- <|assistant|>

- """


- PROMPT = PromptTemplate(
-     template = prompt_template,
-     input_variables = ["context", "question"]
- )

- retriever = vectordb.as_retriever(
-     search_type = "similarity",
-     search_kwargs = {"k": CFG.k}
- )

- qa_chain = RetrievalQA.from_chain_type(
-     llm = llm,
-     chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
-     retriever = retriever,
-     chain_type_kwargs = {"prompt": PROMPT},
-     return_source_documents = True,
-     verbose = False
- )


- def wrap_text_preserve_newlines(text, width=1500):
-     # Split the input text into lines based on newline characters
-     lines = text.split('\n')

-     # Wrap each line individually
-     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

-     # Join the wrapped lines back together using newline characters
-     wrapped_text = '\n'.join(wrapped_lines)

-     return wrapped_text


- def process_llm_response(llm_response):
-     ans = wrap_text_preserve_newlines(llm_response['result'])

-     sources_used = ' \n'.join(
-         [
-             source.metadata['source'].split('/')[-1][:-4]
-             + ' - page: '
-             + str(source.metadata['page'])
-             for source in llm_response['source_documents']
-         ]
-     )

-     ans = ans + '\n\nSources: \n' + sources_used

-     ### return only the text after the pattern
-     pattern = "<|assistant|>"
-     index = ans.find(pattern)
-     if index != -1:
-         ans = ans[index + len(pattern):]

-     return ans.strip()

- @spaces.GPU
- def llm_ans(message, history):

-     llm_response = qa_chain.invoke(message)
-     ans = process_llm_response(llm_response)

-     return ans
-
-
- # @spaces.GPU(duration=60)
- # def chat(message, history, temperature, do_sample, max_tokens):
- #     chat = [{"role": "system", "content": "You are ORPO Tuned Phi Beast. Answer all questions in the most helpful way. No yapping."}]
- #     for item in history:
- #         chat.append({"role": "user", "content": item[0]})
- #         if item[1] is not None:
- #             chat.append({"role": "assistant", "content": item[1]})
- #     chat.append({"role": "user", "content": message})
- #     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
- #     model_inputs = tok([messages], return_tensors="pt").to(device)
- #     streamer = TextIteratorStreamer(
- #         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
- #     )
- #     generate_kwargs = dict(
- #         model_inputs,
- #         streamer=streamer,
- #         max_new_tokens=max_tokens,
- #         do_sample=True,
- #         temperature=temperature,
- #         eos_token_id=terminators,
- #     )

- #     if temperature == 0:
- #         generate_kwargs["do_sample"] = False

- #     t = Thread(target=model.generate, kwargs=generate_kwargs)
- #     t.start()

- #     partial_text = ""
- #     for new_text in streamer:
- #         partial_text += new_text
- #         yield partial_text

- #     yield partial_text


- demo = gr.ChatInterface(
-     fn=llm_ans,
-     examples=[["Write me a poem about Machine Learning."]],
-     # multimodal=False,
-     stop_btn="Stop Generation",
-     title="Chat With LLMs",
-     description="Now Running Phi3-ORPO",
- )
- demo.launch()
+ # import gradio as gr
+ # import torch
+ # from transformers import (
+ #     AutoModelForCausalLM,
+ #     AutoTokenizer,
+ #     TextIteratorStreamer,
+ #     pipeline
+ # )
+ # import os
+ # from threading import Thread
+ # import spaces
+ # import time

+ # import langchain
+ # import os
+ # import glob
+ # import gc

+ # # loaders
+ # from langchain.document_loaders import PyPDFLoader, DirectoryLoader

+ # # splits
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter

+ # # prompts
+ # from langchain import PromptTemplate

+ # # vector stores
+ # from langchain_community.vectorstores import FAISS

+ # # models
+ # from langchain.llms import HuggingFacePipeline
+ # from langchain.embeddings import HuggingFaceInstructEmbeddings

+ # # retrievers
+ # from langchain.chains import RetrievalQA


+ # import subprocess

+ # subprocess.run(
+ #     "pip install flash-attn --no-build-isolation",
+ #     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+ #     shell=True,
+ # )


+ # class CFG:
+ #     DEBUG = False

+ #     ### LLM
+ #     model_name = 'justinj92/phi3-orpo'
+ #     temperature = 0.7
+ #     top_p = 0.90
+ #     repetition_penalty = 1.15
+ #     max_len = 8192
+ #     max_new_tokens = 512
+
+ #     ### splitting
+ #     split_chunk_size = 800
+ #     split_overlap = 400

+ #     ### embeddings
+ #     embeddings_model_repo = 'BAAI/bge-base-en-v1.5'

+ #     ### similar passages
+ #     k = 6

+ #     ### paths
+ #     PDFs_path = './data'
+ #     Embeddings_path = './embeddings/input'
+ #     Output_folder = './ml-papers-vector'

+ # loader = DirectoryLoader(CFG.PDFs_path, glob="*.pdf", loader_cls=PyPDFLoader)

+ # documents = loader.load()


+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size = CFG.split_chunk_size, chunk_overlap = CFG.split_overlap)
+ # texts = text_splitter.split_documents(documents)

+ # if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):
+ #     embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
+ #     vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)
+ #     vectordb.save_local(f"{CFG.Output_folder}/faiss_index_ml_papers")

+ # embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
+ # vectordb = FAISS.load_local(CFG.Output_folder + '/faiss_index_ml_papers', embeddings, allow_dangerous_deserialization=True)


+ # def build_model(model_repo = CFG.model_name):
+ #     tokenizer = AutoTokenizer.from_pretrained(model_repo)
+ #     model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
+ #     if torch.cuda.is_available():
+ #         device = torch.device("cuda")
+ #         print(f"Using GPU: {torch.cuda.get_device_name(device)}")
+ #     else:
+ #         device = torch.device("cpu")
+ #         print("Using CPU")
+ #     device = torch.device("cuda")
+ #     model = model.to(device)

+ #     return tokenizer, model


+ # tok, model = build_model(model_repo = CFG.model_name)

+ # terminators = [
+ #     tok.eos_token_id,
+ #     32007,
+ #     32011,
+ #     32001,
+ #     32000
+ # ]


+ # pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)

+ # llm = HuggingFacePipeline(pipeline = pipe)

+ # prompt_template = """
+ # <|system|>

+ # You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).

+ # You are given some extracted parts from machine learning papers along with a question.

+ # If you don't know the answer, just say "I don't know." Don't try to make up an answer.

+ # It is very important that you ALWAYS answer the question in the same language the question is in. Remember to always do that.

+ # Use only the following pieces of context to answer the question at the end.

+ # <|end|>

+ # <|user|>

+ # Context: {context}

+ # Question is below. Remember to answer in the same language:

+ # Question: {question}

+ # <|end|>

+ # <|assistant|>

+ # """


+ # PROMPT = PromptTemplate(
+ #     template = prompt_template,
+ #     input_variables = ["context", "question"]
+ # )

+ # retriever = vectordb.as_retriever(
+ #     search_type = "similarity",
+ #     search_kwargs = {"k": CFG.k}
+ # )

+ # qa_chain = RetrievalQA.from_chain_type(
+ #     llm = llm,
+ #     chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
+ #     retriever = retriever,
+ #     chain_type_kwargs = {"prompt": PROMPT},
+ #     return_source_documents = True,
+ #     verbose = False
+ # )


+ # def wrap_text_preserve_newlines(text, width=1500):
+ #     # Split the input text into lines based on newline characters
+ #     lines = text.split('\n')

+ #     # Wrap each line individually
+ #     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

+ #     # Join the wrapped lines back together using newline characters
+ #     wrapped_text = '\n'.join(wrapped_lines)

+ #     return wrapped_text


+ # def process_llm_response(llm_response):
+ #     ans = wrap_text_preserve_newlines(llm_response['result'])

+ #     sources_used = ' \n'.join(
+ #         [
+ #             source.metadata['source'].split('/')[-1][:-4]
+ #             + ' - page: '
+ #             + str(source.metadata['page'])
+ #             for source in llm_response['source_documents']
+ #         ]
+ #     )

+ #     ans = ans + '\n\nSources: \n' + sources_used

+ #     ### return only the text after the pattern
+ #     pattern = "<|assistant|>"
+ #     index = ans.find(pattern)
+ #     if index != -1:
+ #         ans = ans[index + len(pattern):]

+ #     return ans.strip()

+ # @spaces.GPU
+ # def llm_ans(message, history):

+ #     llm_response = qa_chain.invoke(message)
+ #     ans = process_llm_response(llm_response)

+ #     return ans
+
+
+ # # @spaces.GPU(duration=60)
+ # # def chat(message, history, temperature, do_sample, max_tokens):
+ # #     chat = [{"role": "system", "content": "You are ORPO Tuned Phi Beast. Answer all questions in the most helpful way. No yapping."}]
+ # #     for item in history:
+ # #         chat.append({"role": "user", "content": item[0]})
+ # #         if item[1] is not None:
+ # #             chat.append({"role": "assistant", "content": item[1]})
+ # #     chat.append({"role": "user", "content": message})
+ # #     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+ # #     model_inputs = tok([messages], return_tensors="pt").to(device)
+ # #     streamer = TextIteratorStreamer(
+ # #         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
+ # #     )
+ # #     generate_kwargs = dict(
+ # #         model_inputs,
+ # #         streamer=streamer,
+ # #         max_new_tokens=max_tokens,
+ # #         do_sample=True,
+ # #         temperature=temperature,
+ # #         eos_token_id=terminators,
+ # #     )
+
+ # #     if temperature == 0:
+ # #         generate_kwargs["do_sample"] = False
+
+ # #     t = Thread(target=model.generate, kwargs=generate_kwargs)
+ # #     t.start()
+
+ # #     partial_text = ""
+ # #     for new_text in streamer:
+ # #         partial_text += new_text
+ # #         yield partial_text
+
+ # #     yield partial_text
+
+
+ # demo = gr.ChatInterface(
+ #     fn=llm_ans,
+ #     examples=[["Write me a poem about Machine Learning."]],
+ #     # multimodal=False,
+ #     stop_btn="Stop Generation",
+ #     title="Chat With LLMs",
+ #     description="Now Running Phi3-ORPO",
+ # )
+ # demo.launch()
+
+
+ import gradio as gr
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ import os
+ from threading import Thread

+ import langchain
+ from langchain.document_loaders import DirectoryLoader, PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain import PromptTemplate
+ from langchain_community.vectorstores import FAISS
+ from langchain.llms import HuggingFacePipeline
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
+ from langchain.chains import RetrievalQA
+ import subprocess
+ import textwrap

+ # Installation command for specific libraries
+ subprocess.run("pip install flash-attn --no-build-isolation", env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, shell=True)
+
+ class CFG:
+     DEBUG = False
+     model_name = 'justinj92/phi3-orpo'
+     temperature = 0.7
+     top_p = 0.90
+     repetition_penalty = 1.15
+     max_len = 8192
+     max_new_tokens = 512
+     split_chunk_size = 800
+     split_overlap = 400
+     embeddings_model_repo = 'BAAI/bge-base-en-v1.5'
+     k = 6
+     PDFs_path = './data'
+     Embeddings_path = './embeddings/input'
+     Output_folder = './ml-papers-vector'

+ loader = DirectoryLoader(CFG.PDFs_path, glob="*.pdf", loader_cls=PyPDFLoader)
+ documents = loader.load()

+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.split_chunk_size, chunk_overlap=CFG.split_overlap)
+ texts = text_splitter.split_documents(documents)

+ if not os.path.exists(f"{CFG.Embeddings_path}/index.faiss"):
+     embeddings = HuggingFaceInstructEmbeddings(model_name=CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
+     vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)
+     vectordb.save_local(f"{CFG.Output_folder}/faiss_index_ml_papers")
+
+ embeddings = HuggingFaceInstructEmbeddings(model_name=CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
+ vectordb = FAISS.load_local(f"{CFG.Output_folder}/faiss_index_ml_papers", embeddings, allow_dangerous_deserialization=True)
+
+ @spaces.GPU
+ def build_model(model_repo=CFG.model_name):
+     tokenizer = AutoTokenizer.from_pretrained(model_repo)
+     model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model = model.to(device)
+     return tokenizer, model
+
+ tok, model = build_model()
+
+ terminators = [tok.eos_token_id, 32007, 32011, 32001, 32000]
+
+ pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)
+ llm = HuggingFacePipeline(pipeline=pipe)
+
+ prompt_template = """
+ You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).
+ You are given some extracted parts from machine learning papers along with a question.
+ If you don't know the answer, just say "I don't know." Don't try to make up an answer.
+ It is very important that you ALWAYS answer the question in the same language the question is in. Remember to always do that.
+ Use only the following pieces of context to answer the question at the end.
+ Context: {context}
+ Question is below. Remember to answer in the same language:
+ Question: {question}
+ """
+
+ PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
+
+ retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": CFG.k})
+ qa_chain = RetrievalQA(llm=llm, retriever=retriever, prompt_template=PROMPT, return_source_documents=True, verbose=False)
+
+ def process_llm_response(llm_response):
+     ans = textwrap.fill(llm_response['result'], width=1500)
+     sources_used = ' \n'.join([f"{source.metadata['source'].split('/')[-1][:-4]} - page: {str(source.metadata['page'])}" for source in llm_response['source_documents']])
+     return f"{ans}\n\nSources:\n{sources_used}"
+
+ @gr.Interface(fn=process_llm_response, inputs=["text", "state"], outputs="text", title="Chat With LLMs", description="Now Running Phi3-ORPO")
+ def llm_ans(message, history):
+     llm_response = qa_chain.invoke(message)
+     return process_llm_response(llm_response)

+ llm_ans.launch()
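
Editor's sketch (not part of the commit): as committed, the new tail of app.py references spaces without importing it, applies gr.Interface as a decorator (in Gradio it is a class, not a decorator), and constructs RetrievalQA directly rather than via RetrievalQA.from_chain_type. A minimal alternative wiring, reusing the RetrievalQA.from_chain_type and gr.ChatInterface pattern from the previous revision, could look like the following; llm, vectordb, PROMPT, CFG, and process_llm_response are assumed to be defined earlier in the script.

    import gradio as gr
    import spaces
    from langchain.chains import RetrievalQA

    # Build the retrieval chain with the prompt defined above; the "stuff" chain
    # places all retrieved chunks into a single context window.
    retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": CFG.k})
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": PROMPT},
        return_source_documents=True,
    )

    @spaces.GPU
    def llm_ans(message, history):
        # history is supplied by gr.ChatInterface but unused; the chain is stateless.
        llm_response = qa_chain.invoke(message)
        return process_llm_response(llm_response)

    demo = gr.ChatInterface(
        fn=llm_ans,
        title="Chat With LLMs",
        description="Now Running Phi3-ORPO",
    )
    demo.launch()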