# Load environment variables (e.g. OPENAI_API_KEY) before importing libraries
# that read them at import time.
from dotenv import load_dotenv

load_dotenv()

import pickle
from typing import Optional

import weave
from llama_index.core import PromptTemplate, VectorStoreIndex, get_response_synthesizer
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# Load the pre-parsed Llama 3 paper documents from a local pickle file.
data_dir = "data/raw_docs/documents.pkl"
with open(data_dir, "rb") as file:
    docs_files = pickle.load(file)

# Tag each document with a 1-based page number so answers can cite pages.
for i, doc in enumerate(docs_files, 1):
    doc.metadata["page"] = i

# Text-QA prompt handed to the response synthesizer; LlamaIndex substitutes
# {context_str} with the retrieved snippets and {query_str} with the question.
SYSTEM_PROMPT_TEMPLATE = """
Answer the following question about the newly released Llama 3 405 billion parameter model based on provided snippets from the research paper. 
Provide helpful, complete, and accurate answers to the question using only the information contained in these snippets.

Here are the relevant snippets from the Llama 3 405B model research paper:

<snippets>
{context_str}
</snippets>

To answer the question:

1. Carefully read and analyze the provided snippets.
2. Identify information that is directly relevant to the user's question.
3. Formulate a comprehensive answer based solely on the information in the snippets.
4. Do not include any information or claims that are not supported by the provided snippets.

Guidelines for your answer:

1. Be technical and informative, providing as much detail as the snippets allow.
2. If the snippets do not contain enough information to fully answer the question, state this clearly and provide what information you can based on the available snippets.
3. Do not make up or infer information beyond what is explicitly stated in the snippets.
4. If the question cannot be answered at all based on the provided snippets, state this clearly and explain why.
5. Use appropriate technical language and terminology as used in the snippets.
6. Cite the relevant sentences from the snippets and their page numbers to support your answer.
7. Answer in MFAQ format (Minimal Facts Answerable Question), providing the most concise and accurate response possible.
8. Use Markdown to format your response and include citation footnotes to indicate the snippets and the page number used to derive your answer.
9. Your answer must always contain footnotes citing the snippets used to derive the answer.

Here's an example of a question and an answer. You must use this as a template to format your response:

<example>
<question>
What was the main mix of the training data? How much data was used to train the model?
</question>

## Answer
The main mix of the training data for the Llama 3 405 billion parameter model is as follows:

- **General knowledge**: 50%
- **Mathematical and reasoning tokens**: 25%
- **Code tokens**: 17%
- **Multilingual tokens**: 8%[^1^].

Regarding the amount of data used to train the model, the snippets do not provide a specific total volume of data in terms of tokens or bytes. However, they do mention that the model was pre-trained on a large dataset containing knowledge until the end of 2023[^2^]. Additionally, the training process involved pre-training on 2.87 trillion tokens before further adjustments[^3^].

[^1^]: "Scaling Laws for Data Mix," page 6.
[^2^]: "Pre-Training Data," page 4.
[^3^]: "Initial Pre-Training," page 14.

</example>

Remember, your role is to accurately convey the information from the research paper snippets, not to speculate or provide information from other sources.

<question>
{query_str}
</question>

Answer:
"""


class SimpleRAGPipeline(weave.Model):
    # Retrieval-augmented QA pipeline over the Llama 3 paper, with its
    # hyperparameters exposed as weave.Model attributes.
    chat_llm: str = "gpt-4o"
    embedding_model: str = "text-embedding-3-small"
    temperature: float = 0.1
    similarity_top_k: int = 15
    chunk_size: int = 512
    chunk_overlap: int = 128
    prompt_template: str = SYSTEM_PROMPT_TEMPLATE
    query_engine: Optional[RetrieverQueryEngine] = None

    def _get_llm(self):
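        # Chat model used to synthesize answers; max_tokens bounds the reply length.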
        return OpenAI(
            model=self.chat_llm,
            temperature=self.temperature,
            max_tokens=4096,
        )

    def _get_embedding_model(self):
        return OpenAIEmbedding(model=self.embedding_model)

    def _get_text_qa_template(self):
        return PromptTemplate(self.prompt_template)

    def _load_documents_and_chunk(self, documents: list):
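        # Split documents into nodes along their Markdown structure (headers,
        # code blocks, tables) rather than by a fixed token window.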
        parser = MarkdownNodeParser()
        nodes = parser.get_nodes_from_documents(documents)
        return nodes

    def _create_vector_index(self, nodes):
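        # Embed the nodes with the configured OpenAI embedding model and build
        # an in-memory vector index, inserting 512 nodes per batch.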
        index = VectorStoreIndex(
            nodes,
            embed_model=self._get_embedding_model(),
            show_progress=True,
            insert_batch_size=512,
        )

        return index

    def _get_retriever(self, index):
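        # Dense retriever returning the top-k nodes by embedding similarity.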
        retriever = VectorIndexRetriever(
            index=index,
            similarity_top_k=self.similarity_top_k,
        )
        return retriever

    def _get_response_synthesizer(self):
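        # "compact" mode packs as many retrieved snippets as fit into each LLM
        # call; streaming=True exposes the answer as a token generator.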
        llm = self._get_llm()
        response_synthesizer = get_response_synthesizer(
            llm=llm,
            response_mode="compact",
            text_qa_template=self._get_text_qa_template(),
            streaming=True,
        )
        return response_synthesizer

    def build_query_engine(self):
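        # Assemble the full pipeline: chunk the documents, build the vector
        # index, then combine retriever and synthesizer into a query engine.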
        nodes = self._load_documents_and_chunk(docs_files)
        index = self._create_vector_index(nodes)
        retriever = self._get_retriever(index)
        response_synthesizer = self._get_response_synthesizer()

        self.query_engine = RetrieverQueryEngine(
            retriever=retriever,
            response_synthesizer=response_synthesizer,
        )

    @weave.op()
    def predict(self, question: str):
        # Run retrieval + synthesis, and return the Weave call metadata so the
        # trace can be located in the UI.
        response = self.query_engine.query(question)
        current_call = weave.get_current_call()
        return {
            "response": response,
            "call_id": current_call.id,
            "url": current_call.ui_url,
        }


if __name__ == "__main__":
    # NOTE: the Weave project name below is a placeholder; replace it with your
    # own. Initializing Weave is required for @weave.op tracing and for
    # weave.get_current_call() inside predict().
    weave.init("simple-rag-pipeline")

    rag_pipeline = SimpleRAGPipeline()
    rag_pipeline.build_query_engine()

    result = rag_pipeline.predict(
        "How does the model perform in comparison to the GPT-4 model?"
    )
    # predict() returns a dict; the streaming answer lives under "response".
    for token in result["response"].response_gen:
        print(token, end="")