# Gradio RAG chat app for Hugging Face Spaces: indexes local documents with
# LlamaIndex and answers questions with a Phi-3 based model.
import os
import subprocess
import sys

# Runtime installs run before the heavy imports so the upgraded packages are
# the versions that actually get loaded.
subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/huggingface/transformers", "-U"])

# Install flash-attn without compiling its CUDA extension at install time;
# keep the existing environment (PATH etc.) and only add the skip flag.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

import torch
import spaces

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, SummaryIndex, Settings
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding


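# Load the documents from ./data. A SummaryIndex is built here but is not
# wired into the chat UI below, which queries the vector index instead.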
documents = SimpleDirectoryReader("./data").load_data()
# vector_index = VectorStoreIndex.from_documents(documents)
summary_index = SummaryIndex.from_documents(documents)

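# Convert LlamaIndex chat messages into the Phi-3 chat template
# (<|system|> / <|user|> / <|assistant|> ... <|end|> tags), prepending a
# default system prompt when none is supplied.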
def messages_to_prompt(messages):
    prompt = ""
    system_found = False
    for message in messages:
        if message.role == "system":
            prompt += f"<|system|>\n{message.content}<|end|>\n"
            system_found = True
        elif message.role == "user":
            prompt += f"<|user|>\n{message.content}<|end|>\n"
        elif message.role == "assistant":
            prompt += f"<|assistant|>\n{message.content}<|end|>\n"
        else:
            prompt += f"<|user|>\n{message.content}<|end|>\n"

    # trailing assistant tag so the model generates the reply
    prompt += "<|assistant|>\n"

    if not system_found:
        prompt = (
            "<|system|>\nYou are a helpful AI research assistant built by Justin. You only answer from the context provided.<|end|>\n" + prompt
        )

    return prompt

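# Serve justinj92/phi3-orpo through LlamaIndex's local HuggingFaceLLM wrapper:
# bfloat16 weights, sampling at temperature 0.7, and the Phi-3 prompt
# formatting defined above for both completion-style and chat-style queries.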
llm = HuggingFaceLLM(
    model_name="justinj92/phi3-orpo",
    model_kwargs={
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16
    },
    generate_kwargs={"do_sample": True, "temperature": 0.7},
    tokenizer_name="justinj92/phi3-orpo",
    query_wrapper_prompt=(
        "<|system|>\n"
        "You are a helpful AI research assistant built by Justin. You only answer from the context provided.<|end|>\n"
        "<|user|>\n"
        "{query_str}<|end|>\n"
        "<|assistant|>\n"
    ),
    messages_to_prompt=messages_to_prompt,
    is_chat_model=True,
)

Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)
# Settings.chunk_size replaces the deprecated ServiceContext(chunk_size=1024)
# setup: the global Settings now carry the LLM, embedding model, and chunking.
Settings.chunk_size = 1024

index = VectorStoreIndex.from_documents(documents)

query_engine = index.as_query_engine()

# ZeroGPU: the decorator allocates a GPU for the duration of each call.
@spaces.GPU
def predict(message, history):
    response = query_engine.query(message)
    return str(response)

import gradio as gr
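# Minimal chat UI; the conversation history is ignored because every query is
# answered independently against the index.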
gr.ChatInterface(predict).launch(share=True)