gufett0 committed on
Commit
f0608de
1 Parent(s): cd97913

vectostoreindex

Browse files
Files changed (5) hide show
  1. .gitignore +0 -2
  2. app-last.py +244 -0
  3. app.py +16 -231
  4. backend.py +1 -1
  5. interface.py +105 -0
.gitignore CHANGED
@@ -1,5 +1,3 @@
1
  /myenv
2
  __pycache__/
3
  appcompleta.py
4
- appLlama.py
5
- interface.py
 
1
  /myenv
2
  __pycache__/
3
  appcompleta.py
 
 
app-last.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import spaces
3
+ from threading import Thread
4
+ from typing import Iterator
5
+ from backend2 import load_documents, prepare_documents, get_context_sources
6
+ import gradio as gr
7
+ import torch
8
+ from transformers import AutoModelForCausalLM, GemmaTokenizerFast, TextIteratorStreamer
9
+ from huggingface_hub import login
10
+ from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader, ChatPromptTemplate, PromptTemplate, load_index_from_storage, StorageContext
11
+ from llama_index.core.node_parser import SentenceSplitter
12
+ from llama_index.embeddings.instructor import InstructorEmbedding
13
+
14
+
15
+ huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
16
+ login(huggingface_token)
17
+
18
+ DESCRIPTION = """\
19
+ # La Chatbot degli Osservatori
20
+ """
21
+ MAX_MAX_NEW_TOKENS = 2048
22
+ DEFAULT_MAX_NEW_TOKENS = 1024
23
+ os.environ["MAX_INPUT_TOKEN_LENGTH"] = "4096" #"8192"
24
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH"))
25
+
26
+
27
+ # Force usage of CPU
28
+ #device = torch.device("cpu")
29
+
30
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
31
+
32
+ model_id = "google/gemma-2-2b-it"
33
+ model = AutoModelForCausalLM.from_pretrained(
34
+ model_id,
35
+ device_map="auto",
36
+ torch_dtype= torch.bfloat16 #torch.float16 if torch.cuda.is_available() else torch.float32,
37
+ )
38
+ tokenizer = GemmaTokenizerFast.from_pretrained(model_id)
39
+ #tokenizer = AutoTokenizer.from_pretrained(model_id)
40
+ tokenizer.use_default_system_prompt = False
41
+ model.config.sliding_window = 4096
42
+ #model = model.to(device)
43
+ model.eval()
44
+
45
+ ###------####
46
+ # rag
47
+ documents_paths = {
48
+ 'blockchain': 'documents/blockchain',
49
+ 'metaverse': 'documents/metaverso',
50
+ 'payment': 'documents/payment'
51
+ }
52
+
53
+ global session_state
54
+ session_state = {"index": False,
55
+ "documents_loaded": False,
56
+ "document_db": None,
57
+ "original_message": None,
58
+ "clarification": False}
59
+
60
+ INSTRUCTION_1 = 'In italiano, chiedi sempre se la domanda si riferisce agli "Osservatori Blockchain", "Osservatori Payment" oppure "Osservatori Metaverse".'
61
+ INSTRUCTION_2 = 'Sei un assistente che risponde sempre in italiano alle domande basandosi solo sulle informazioni fornite nel contesto che ti darò. Se non trovi informazioni, rispondi "Puoi chiedere maggiori informazioni all\'ufficio di riferimento.". Se invece la domanda è completamente fuori contesto, non rispondere e rammenta il topic del contesto'
62
+
63
+
64
+ """# Reading documents from disk
65
+ docs = SimpleDirectoryReader(input_files=["data/blockchainprova.txt"]).load_data()
66
+ # Splitting the document into chunks with
67
+ # predefined size and overlap
68
+ parser = SentenceSplitter.from_defaults(
69
+ chunk_size=256, chunk_overlap=64, paragraph_separator="\n\n"
70
+ )
71
+ nodes = parser.get_nodes_from_documents(docs)"""
72
+
73
+
74
@spaces.GPU()
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream a retrieval-augmented answer for ``message``.

    The first time a message contains one of the ``documents_paths`` keys
    (case-insensitive substring match), the matching document folder is
    loaded and indexed, and the retrieved context is prepended to the
    question. Until that happens the model is told (``INSTRUCTION_1``) to ask
    the user which topic the question refers to. Once an index exists, every
    later question is answered against it under ``INSTRUCTION_2``.

    Args:
        message: The latest user message.
        chat_history: Previous ``(user, assistant)`` turns.
        max_new_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling threshold.
        top_k: Top-k sampling cutoff.
        repetition_penalty: Repetition penalty passed to ``model.generate``.

    Yields:
        The answer accumulated so far, once per streamed text chunk; when
        retrieval produced sources, a final item with the sources appended.
    """
    # NOTE(review): ``matched_path`` and ``session_state`` are module-level,
    # so the selected topic appears to be shared by every caller of this
    # process -- confirm that is acceptable for multi-user deployments.
    global matched_path

    conversation = []
    for user, assistant in chat_history:
        conversation.extend(
            [
                {"role": "user", "content": user},
                {"role": "assistant", "content": assistant},
            ]
        )

    # Stays None on the clarification path so the footer below is skipped
    # instead of hitting an unbound name.
    sources = None

    if not session_state["index"]:
        matched_path = None
        words = message.lower()
        for key, path in documents_paths.items():
            if key in words:
                matched_path = path
                break

        if matched_path:
            documents = load_documents(matched_path)
            DB = prepare_documents(documents)
            # Keep the prepared index so later turns do not rebuild it.
            session_state["document_db"] = DB
            context, sources = get_context_sources(message, DB)
            print("*** sources ***", sources)
            # The sources belong in the message: gr.Info's second positional
            # argument is not message text.
            gr.Info(f"doc preparati con {sources}")

            conversation.append({"role": "user", "content": f'Contesto: {context}\n\n Domanda: {message}. Rispondi in italiano'})

            session_state["index"] = True
            session_state["documents_loaded"] = True
        else:
            # No topic recognised: have the model ask for clarification.
            conversation.append({"role": "user", "content": f"Domanda: {message} . Comando: {INSTRUCTION_1}"})
            gr.Info("richiesta di chiarimento")
        print("******** CONV1 ", conversation)
    else:
        DB = session_state["document_db"]
        if DB is None:
            # Fallback when the flag was set without a cached index.
            documents = load_documents(matched_path)
            DB = prepare_documents(documents)
            session_state["document_db"] = DB
        context, sources = get_context_sources(message, DB)
        session_state["documents_loaded"] = True
        gr.Info("contesto già indicizzato")
        conversation.append({"role": "user", "content": f"{INSTRUCTION_2}"})
        conversation.append({"role": "assistant", "content": "Ok."})
        conversation.append({"role": "user", "content": f'Contesto: {context}\n\n Domanda: {message}. Rispondi in italiano'})

        print("******** CONV2 ", conversation)

    # Tokenise the conversation, keeping only the most recent tokens when the
    # prompt exceeds the configured input budget.
    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)

    streamer = TextIteratorStreamer(tokenizer, timeout=None, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )
    # Generation runs in a worker thread so the streamer can be consumed here.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)
    t.join()

    if sources:
        # Blank line keeps the footer from running into the generated text.
        outputs.append(f"\n\nFonti utilizzate: {sources}")
        yield "".join(outputs)

    print("debug - CHATHISTORY", chat_history)
186
+
187
+ chat_interface = gr.ChatInterface(
188
+ fn=generate,
189
+ additional_inputs=[
190
+ gr.Slider(
191
+ label="Max new tokens",
192
+ minimum=1,
193
+ maximum=MAX_MAX_NEW_TOKENS,
194
+ step=1,
195
+ value=DEFAULT_MAX_NEW_TOKENS,
196
+ ),
197
+ gr.Slider(
198
+ label="Temperature",
199
+ minimum=0.1,
200
+ maximum=4.0,
201
+ step=0.1,
202
+ value=0.6,
203
+ ),
204
+ gr.Slider(
205
+ label="Top-p (nucleus sampling)",
206
+ minimum=0.05,
207
+ maximum=1.0,
208
+ step=0.05,
209
+ value=0.9,
210
+ ),
211
+ gr.Slider(
212
+ label="Top-k",
213
+ minimum=1,
214
+ maximum=1000,
215
+ step=1,
216
+ value=50,
217
+ ),
218
+ gr.Slider(
219
+ label="Repetition penalty",
220
+ minimum=1.0,
221
+ maximum=2.0,
222
+ step=0.05,
223
+ value=1.2,
224
+ ),
225
+ ],
226
+ stop_btn=None,
227
+ examples=[
228
+ ["Ciao, in cosa puoi aiutarmi?"],
229
+ ["Ciao, in cosa consiste un piatto di spaghetti?"],
230
+ ["Ciao, quali sono le aziende che hanno iniziato ad integrare le stablecoins? Fammi un breve sommario."],
231
+ ["Spiegami la differenza tra mondi virtuali pubblici o privati"],
232
+ ["Trovami un esempio di progetto B2B"],
233
+ ["Quali sono le regole europee sui bonifici istantanei?"],
234
+ ],
235
+ cache_examples=False,
236
+ )
237
+
238
+ with gr.Blocks(css=".gradio-container {background-color: #B9D9EB}", fill_height=True) as demo:
239
+ gr.Markdown(DESCRIPTION, elem_classes="centered")
240
+ chat_interface.render()
241
+
242
+ if __name__ == "__main__":
243
+ #demo.queue(max_size=20).launch()
244
+ demo.launch()
app.py CHANGED
@@ -1,244 +1,29 @@
1
- import os
2
- import spaces
3
- from threading import Thread
4
- from typing import Iterator
5
- from backend2 import load_documents, prepare_documents, get_context_sources
6
  import gradio as gr
7
- import torch
8
- from transformers import AutoModelForCausalLM, GemmaTokenizerFast, TextIteratorStreamer
9
- from huggingface_hub import login
10
- from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader, ChatPromptTemplate, PromptTemplate, load_index_from_storage, StorageContext
11
- from llama_index.core.node_parser import SentenceSplitter
12
- from llama_index.embeddings.instructor import InstructorEmbedding
13
 
14
 
15
- huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
16
- login(huggingface_token)
17
-
18
  DESCRIPTION = """\
19
- # La Chatbot degli Osservatori
20
- """
21
- MAX_MAX_NEW_TOKENS = 2048
22
- DEFAULT_MAX_NEW_TOKENS = 1024
23
- os.environ["MAX_INPUT_TOKEN_LENGTH"] = "4096" #"8192"
24
- MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH"))
25
-
26
-
27
- # Force usage of CPU
28
- #device = torch.device("cpu")
29
-
30
- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
31
-
32
- model_id = "google/gemma-2-2b-it"
33
- model = AutoModelForCausalLM.from_pretrained(
34
- model_id,
35
- device_map="auto",
36
- torch_dtype= torch.bfloat16 #torch.float16 if torch.cuda.is_available() else torch.float32,
37
- )
38
- tokenizer = GemmaTokenizerFast.from_pretrained(model_id)
39
- #tokenizer = AutoTokenizer.from_pretrained(model_id)
40
- tokenizer.use_default_system_prompt = False
41
- model.config.sliding_window = 4096
42
- #model = model.to(device)
43
- model.eval()
44
 
45
- ###------####
46
- # rag
47
- documents_paths = {
48
- 'blockchain': 'documents/blockchain',
49
- 'metaverse': 'documents/metaverso',
50
- 'payment': 'documents/payment'
51
- }
52
 
53
- global session_state
54
- session_state = {"index": False,
55
- "documents_loaded": False,
56
- "document_db": None,
57
- "original_message": None,
58
- "clarification": False}
59
-
60
- INSTRUCTION_1 = 'In italiano, chiedi sempre se la domanda si riferisce agli "Osservatori Blockchain", "Osservatori Payment" oppure "Osservatori Metaverse".'
61
- INSTRUCTION_2 = 'Sei un assistente che risponde sempre in italiano alle domande basandosi solo sulle informazioni fornite nel contesto che ti darò. Se non trovi informazioni, rispondi "Puoi chiedere maggiori informazioni all\'ufficio di riferimento.". Se invece la domanda è completamente fuori contesto, non rispondere e rammenta il topic del contesto'
62
-
63
-
64
- """# Reading documents from disk
65
- docs = SimpleDirectoryReader(input_files=["data/blockchainprova.txt"]).load_data()
66
- # Splitting the document into chunks with
67
- # predefined size and overlap
68
- parser = SentenceSplitter.from_defaults(
69
- chunk_size=256, chunk_overlap=64, paragraph_separator="\n\n"
70
- )
71
- nodes = parser.get_nodes_from_documents(docs)"""
72
 
73
 
74
- @spaces.GPU()
75
- def generate(
76
- message: str,
77
- chat_history: list[tuple[str, str]],
78
- max_new_tokens: int = 1024,
79
- temperature: float = 0.6,
80
- top_p: float = 0.9,
81
- top_k: int = 50,
82
- repetition_penalty: float = 1.2,
83
- ) -> Iterator[str]:
84
-
85
-
86
- global matched_path
87
-
88
- conversation = []
89
- for user, assistant in chat_history:
90
- conversation.extend(
91
- [
92
- {"role": "user", "content": user},
93
- {"role": "assistant", "content": assistant},
94
- ]
95
  )
96
-
97
-
98
- if not session_state["index"]:
99
-
100
- matched_path = None
101
- words = message.lower()
102
- for key, path in documents_paths.items():
103
- if key in words:
104
- matched_path = path
105
- break
106
- if matched_path:
107
- documents = load_documents(matched_path)
108
- DB = prepare_documents(documents)
109
- context, sources = get_context_sources(message, DB)
110
- print("*** sources ***", sources)
111
- gr.Info("doc preparati con ", sources)
112
-
113
- conversation.append({"role": "user", "content": f'Contesto: {context}\n\n Domanda: {message}. Rispondi in italiano'})
114
 
115
- ######
116
-
117
- """index = VectorStoreIndex(nodes)
118
- # get retriver
119
- retriever = index.as_retriever(similarity_top_k=3)
120
- relevant_chunks = retriever.retrieve(message)
121
- print(f"Found: {len(relevant_chunks)} relevant chunks")
122
- for idx, chunk in enumerate(relevant_chunks):
123
-
124
- info_message += f"{idx + 1}) {chunk.text[:64]}...\n"
125
- print(info_message)
126
- gr.Info(info_message)"""
127
-
128
- session_state["index"] = True
129
-
130
- else: ## CHIEDI CHIARIMENTO
131
-
132
- conversation.append({"role": "user", "content": f"Domanda: {message} . Comando: {INSTRUCTION_1}" })
133
- gr.Info("richiesta di chiarimento")
134
- print("******** CONV1 ", conversation)
135
-
136
-
137
-
138
- else:
139
-
140
- documents = load_documents(matched_path)
141
- DB = prepare_documents(documents)
142
- context, sources = get_context_sources(message, DB)
143
- gr.Info("contesto già indicizzato")
144
- conversation.append({"role": "user", "content": f"{INSTRUCTION_2}"})
145
- conversation.append({"role": "assistant", "content": "Ok."})
146
- conversation.append({"role": "user", "content": f'Contesto: {context}\n\n Domanda: {message}. Rispondi in italiano'})
147
-
148
- print("******** CONV2 ", conversation)
149
-
150
 
151
-
152
- # Iterate model output
153
- input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
154
- if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
155
- input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
156
- gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
157
- input_ids = input_ids.to(model.device)
158
-
159
- streamer = TextIteratorStreamer(tokenizer, timeout=None, skip_prompt=True, skip_special_tokens=True)
160
- generate_kwargs = dict(
161
- {"input_ids": input_ids},
162
- streamer=streamer,
163
- max_new_tokens=max_new_tokens,
164
- do_sample=True,
165
- top_p=top_p,
166
- top_k=top_k,
167
- temperature=temperature,
168
- num_beams=1,
169
- repetition_penalty=repetition_penalty,
170
- )
171
- t = Thread(target=model.generate, kwargs=generate_kwargs)
172
- t.start()
173
 
174
-
175
- outputs = []
176
- for text in streamer:
177
- outputs.append(text)
178
- yield "".join(outputs)
179
-
180
- if session_state["documents_loaded"]:
181
- outputs.append(f"Fonti utilizzate: {sources}")
182
- yield "".join(outputs)
183
-
184
- #sources = []
185
- print("debug - CHATHISTORY", chat_history)
186
-
187
- chat_interface = gr.ChatInterface(
188
- fn=generate,
189
- additional_inputs=[
190
- gr.Slider(
191
- label="Max new tokens",
192
- minimum=1,
193
- maximum=MAX_MAX_NEW_TOKENS,
194
- step=1,
195
- value=DEFAULT_MAX_NEW_TOKENS,
196
- ),
197
- gr.Slider(
198
- label="Temperature",
199
- minimum=0.1,
200
- maximum=4.0,
201
- step=0.1,
202
- value=0.6,
203
- ),
204
- gr.Slider(
205
- label="Top-p (nucleus sampling)",
206
- minimum=0.05,
207
- maximum=1.0,
208
- step=0.05,
209
- value=0.9,
210
- ),
211
- gr.Slider(
212
- label="Top-k",
213
- minimum=1,
214
- maximum=1000,
215
- step=1,
216
- value=50,
217
- ),
218
- gr.Slider(
219
- label="Repetition penalty",
220
- minimum=1.0,
221
- maximum=2.0,
222
- step=0.05,
223
- value=1.2,
224
- ),
225
- ],
226
- stop_btn=None,
227
- examples=[
228
- ["Ciao, in cosa puoi aiutarmi?"],
229
- ["Ciao, in cosa consiste un piatto di spaghetti?"],
230
- ["Ciao, quali sono le aziende che hanno iniziato ad integrare le stablecoins? Fammi un breve sommario."],
231
- ["Spiegami la differenza tra mondi virtuali pubblici o privati"],
232
- ["Trovami un esempio di progetto B2B"],
233
- ["Quali sono le regole europee sui bonifici istantanei?"],
234
- ],
235
- cache_examples=False,
236
- )
237
-
238
- with gr.Blocks(css=".gradio-container {background-color: #B9D9EB}", fill_height=True) as demo:
239
- gr.Markdown(DESCRIPTION, elem_classes="centered")
240
- chat_interface.render()
241
-
242
  if __name__ == "__main__":
243
- #demo.queue(max_size=20).launch()
244
- demo.launch()
 
 
1
+ from backend import handle_query
 
 
 
 
2
  import gradio as gr
 
 
 
 
 
 
3
 
4
 
 
 
 
5
  DESCRIPTION = """\
6
+ # <div style="text-align: center;">Odi, l'assistente ricercatore degli Osservatori</div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
 
 
 
 
 
 
 
8
 
9
+ 👉 Retrieval-Augmented Generation - Ask me anything about the research carried out at the Osservatori.
10
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
 
13
# Chat UI wired to the RAG backend; handle_query produces the answers.
chat_interface = gr.ChatInterface(
    fn=handle_query,
    chatbot=gr.Chatbot(height=500),
    # Placeholder typo fixed: "qualasiasi" -> "qualsiasi".
    textbox=gr.Textbox(placeholder="Chiedimi qualsiasi cosa relativa agli Osservatori", container=False, scale=7),
    #examples=[["Ciao, in cosa puoi aiutarmi?"],["Dimmi i risultati e le modalità di conduzione del censimento per favore"]]
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
+ with gr.Blocks(css=".gradio-container {background-color: #B9D9EB}") as demo:
22
+ gr.Markdown(DESCRIPTION)
23
+ #gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
24
+ chat_interface.render()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  if __name__ == "__main__":
27
+ #progress = gr.Progress(track_tqdm=True)
28
+ demo.launch()
29
+
backend.py CHANGED
@@ -65,7 +65,7 @@ llm = HuggingFaceLLM(
65
  model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
66
  )
67
 
68
- Settings.llm = GemmaLLMInterface()
69
  Settings.llm = llm
70
 
71
 
 
65
  model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
66
  )
67
 
68
+ #Settings.llm = GemmaLLMInterface()
69
  Settings.llm = llm
70
 
71
 
interface.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForCausalLM
2
+ from llama_index.core.llms import CustomLLM, LLMMetadata, CompletionResponse, CompletionResponseGen
3
+ from llama_index.core.llms.callbacks import llm_completion_callback
4
+ from typing import Any, Iterator
5
+ import torch
6
+ from transformers import TextIteratorStreamer
7
+ from threading import Thread
8
+ from pydantic import Field, field_validator
9
+ import keras
10
+ import keras_nlp
11
+
12
+ # for transformers 2 (__setattr__ is used to bypass Pydantic check )
13
+ """class GemmaLLMInterface(CustomLLM):
14
+ def __init__(self, model_id: str = "google/gemma-2-2b-it", **kwargs):
15
+ super().__init__(**kwargs)
16
+ object.__setattr__(self, "model_id", model_id)
17
+ model = AutoModelForCausalLM.from_pretrained(
18
+ model_id,
19
+ device_map="auto",
20
+ torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
21
+ )
22
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
23
+ object.__setattr__(self, "model", model)
24
+ object.__setattr__(self, "tokenizer", tokenizer)
25
+ object.__setattr__(self, "context_window", 8192)
26
+ object.__setattr__(self, "num_output", 2048)
27
+
28
+ def _format_prompt(self, message: str) -> str:
29
+ return (
30
+ f"<start_of_turn>user\n{message}<end_of_turn>\n"
31
+ f"<start_of_turn>model\n"
32
+ )
33
+
34
+ @property
35
+ def metadata(self) -> LLMMetadata:
36
+ return LLMMetadata(
37
+ context_window=self.context_window,
38
+ num_output=self.num_output,
39
+ model_name=self.model_id,
40
+ )
41
+
42
+
43
+ @llm_completion_callback()
44
+ def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
45
+ prompt = self._format_prompt(prompt)
46
+ inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
47
+ outputs = self.model.generate(**inputs, max_new_tokens=self.num_output)
48
+ response = self.tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
49
+ response = response[len(prompt):].strip()
50
+ return CompletionResponse(text=response if response else "No response generated.")
51
+
52
+ @llm_completion_callback()
53
+ def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
54
+ #prompt = self._format_prompt(prompt)
55
+ inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
56
+
57
+ streamer = TextIteratorStreamer(self.tokenizer, skip_special_tokens=True)
58
+ generation_kwargs = dict(inputs, max_new_tokens=self.num_output, streamer=streamer)
59
+
60
+ thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
61
+ thread.start()
62
+
63
+ streamed_response = ""
64
+ for new_text in streamer:
65
+ if new_text:
66
+ streamed_response += new_text
67
+ yield CompletionResponse(text=streamed_response, delta=new_text)
68
+
69
+ if not streamed_response:
70
+ yield CompletionResponse(text="No response generated.", delta="No response generated.")"""
71
+
72
+ # for Keras
73
class GemmaLLMInterface(CustomLLM):
    """LlamaIndex ``CustomLLM`` adapter around a Keras-NLP Gemma causal LM.

    ``model`` must be assigned before ``complete`` or ``stream_complete`` is
    called; it defaults to ``None``.
    """

    # Underlying Keras-NLP model used for generation.
    model: keras_nlp.models.GemmaCausalLM = None
    # Values reported to LlamaIndex through ``metadata``.
    context_window: int = 8192
    num_output: int = 2048
    model_name: str = "gemma_2"

    def _format_prompt(self, message: str) -> str:
        """Wrap ``message`` as a single user turn followed by an open model turn."""
        return (
            f"<start_of_turn>user\n{message}<end_of_turn>\n" f"<start_of_turn>model\n"
        )

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name,
        )

    @llm_completion_callback()
    def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse:
        """Generate a completion for ``prompt`` and return it without the prompt.

        Args:
            prompt: Raw user text; it is wrapped with ``_format_prompt``.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            A ``CompletionResponse`` holding the generated text after the
            formatted prompt.
        """
        prompt = self._format_prompt(prompt)
        # NOTE(review): ``max_length`` is fed ``num_output``; confirm whether
        # the backend counts prompt tokens toward this limit.
        raw_response = self.model.generate(prompt, max_length=self.num_output)
        # Assumes the returned string starts with the formatted prompt, which
        # is sliced off by length -- TODO confirm.
        response = raw_response[len(prompt) :]
        return CompletionResponse(text=response)

    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        """Yield the completion one character at a time.

        The full completion is generated first via ``complete`` and then
        replayed. Each yielded response carries the text emitted so far in
        ``text`` and the newly added character in ``delta``.

        Args:
            prompt: Raw user text, forwarded to ``complete``.
            **kwargs: Accepted for interface compatibility; unused here.

        Yields:
            ``CompletionResponse`` objects with a growing ``text`` prefix.
        """
        full_text = self.complete(prompt).text
        # Start from an empty prefix: seeding the accumulator with the whole
        # completion would duplicate the answer in every streamed ``text``.
        for end, char in enumerate(full_text, start=1):
            yield CompletionResponse(text=full_text[:end], delta=char)