justinj92 committed on
Commit 9f02f73
Parent: 7b4f74e

Update app.py

Files changed (1)
  1. app.py +284 -190
app.py CHANGED
@@ -1,261 +1,355 @@
- import gradio as gr
- import torch
- from transformers import (
-     AutoModelForCausalLM,
-     AutoTokenizer,
-     TextIteratorStreamer,
-     pipeline
- )
- import os
- from threading import Thread
- import spaces
- import time

- import langchain
- import os
- import glob
- import gc

- # loaders
- from langchain.document_loaders import PyPDFLoader, DirectoryLoader

- # splits
- from langchain.text_splitter import RecursiveCharacterTextSplitter

- # prompts
- from langchain import PromptTemplate

- # vector stores
- from langchain_community.vectorstores import FAISS

- # models
- from langchain.llms import HuggingFacePipeline
- from langchain.embeddings import HuggingFaceInstructEmbeddings

- # retrievers
- from langchain.chains import RetrievalQA


- import subprocess

- subprocess.run(
-     "pip install flash-attn --no-build-isolation",
-     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
-     shell=True,
- )


- class CFG:
-     DEBUG = False

-     ### LLM
-     model_name = 'justinj92/phi3-orpo'
-     temperature = 0.7
-     top_p = 0.90
-     repetition_penalty = 1.15
-     max_len = 8192
-     max_new_tokens = 512
-
-     ### splitting
-     split_chunk_size = 800
-     split_overlap = 400

-     ### embeddings
-     embeddings_model_repo = 'BAAI/bge-base-en-v1.5'

-     ### similar passages
-     k = 6

-     ### paths
-     PDFs_path = './data'
-     Embeddings_path = './embeddings/input'
-     Output_folder = './ml-papers-vector'

- loader = DirectoryLoader(CFG.PDFs_path, glob="*.pdf", loader_cls=PyPDFLoader)

- documents = loader.load()


- text_splitter = RecursiveCharacterTextSplitter(chunk_size = CFG.split_chunk_size, chunk_overlap = CFG.split_overlap)
- texts = text_splitter.split_documents(documents)

- if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):
-     embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
-     vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)
-     vectordb.save_local(f"{CFG.Output_folder}/faiss_index_ml_papers")

- embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
- vectordb = FAISS.load_local(CFG.Output_folder + '/faiss_index_ml_papers', embeddings, allow_dangerous_deserialization=True)


- def build_model(model_repo = CFG.model_name):
-     tokenizer = AutoTokenizer.from_pretrained(model_repo)
-     model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
-     if torch.cuda.is_available():
-         device = torch.device("cuda")
-         print(f"Using GPU: {torch.cuda.get_device_name(device)}")
-     else:
-         device = torch.device("cpu")
-         print("Using CPU")
-     device = torch.device("cuda")
-     model = model.to(device)

-     return tokenizer, model


- tok, model = build_model(model_repo = CFG.model_name)

- terminators = [
-     tok.eos_token_id,
-     32007,
-     32011,
-     32001,
-     32000
- ]


- pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)

- llm = HuggingFacePipeline(pipeline = pipe)

- prompt_template = """
- <|system|>

- You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).

- You are given some extracted parts from machine learning papers along with a question.

- If you don't know the answer, just say "I don't know." Don't try to make up an answer.

- It is very important that you ALWAYS answer the question in the same language the question is in. Remember to always do that.

- Use only the following pieces of context to answer the question at the end.

- <|end|>

- <|user|>

- Context: {context}

- Question is below. Remember to answer in the same language:

- Question: {question}

- <|end|>

- <|assistant|>

- """


- PROMPT = PromptTemplate(
-     template = prompt_template,
-     input_variables = ["context", "question"]
- )

- retriever = vectordb.as_retriever(
-     search_type = "similarity",
-     search_kwargs = {"k": CFG.k}
- )

- qa_chain = RetrievalQA.from_chain_type(
-     llm = llm,
-     chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
-     retriever = retriever,
-     chain_type_kwargs = {"prompt": PROMPT},
-     return_source_documents = True,
-     verbose = False
- )


- def wrap_text_preserve_newlines(text, width=1500):
-     # Split the input text into lines based on newline characters
-     lines = text.split('\n')

-     # Wrap each line individually
-     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

-     # Join the wrapped lines back together using newline characters
-     wrapped_text = '\n'.join(wrapped_lines)

-     return wrapped_text


- def process_llm_response(llm_response):
-     ans = wrap_text_preserve_newlines(llm_response['result'])

-     sources_used = ' \n'.join(
-         [
-             source.metadata['source'].split('/')[-1][:-4]
-             + ' - page: '
-             + str(source.metadata['page'])
-             for source in llm_response['source_documents']
-         ]
-     )

-     ans = ans + '\n\nSources: \n' + sources_used

-     ### return only the text after the pattern
-     pattern = "<|assistant|>"
-     index = ans.find(pattern)
-     if index != -1:
-         ans = ans[index + len(pattern):]

-     return ans.strip()

- @spaces.GPU
- def llm_ans(message, history):

-     llm_response = qa_chain.invoke(message)
-     ans = process_llm_response(llm_response)

-     return ans
-
-
- # @spaces.GPU(duration=60)
- # def chat(message, history, temperature, do_sample, max_tokens):
- #     chat = [{"role": "system", "content": "You are ORPO Tuned Phi Beast. Answer all questions in the most helpful way. No yapping."}]
- #     for item in history:
- #         chat.append({"role": "user", "content": item[0]})
- #         if item[1] is not None:
- #             chat.append({"role": "assistant", "content": item[1]})
- #     chat.append({"role": "user", "content": message})
- #     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
- #     model_inputs = tok([messages], return_tensors="pt").to(device)
- #     streamer = TextIteratorStreamer(
- #         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
- #     )
- #     generate_kwargs = dict(
- #         model_inputs,
- #         streamer=streamer,
- #         max_new_tokens=max_tokens,
- #         do_sample=True,
- #         temperature=temperature,
- #         eos_token_id=terminators,
- #     )

- #     if temperature == 0:
- #         generate_kwargs["do_sample"] = False

- #     t = Thread(target=model.generate, kwargs=generate_kwargs)
- #     t.start()

- #     partial_text = ""
- #     for new_text in streamer:
- #         partial_text += new_text
- #         yield partial_text

- #     yield partial_text


- demo = gr.ChatInterface(
-     fn=llm_ans,
-     examples=[["Write me a poem about Machine Learning."]],
-     # multimodal=False,
-     stop_btn="Stop Generation",
-     title="Chat With LLMs",
-     description="Now Running Phi3-ORPO",
- )
- demo.launch()
+ # import gradio as gr
+ # import torch
+ # from transformers import (
+ #     AutoModelForCausalLM,
+ #     AutoTokenizer,
+ #     TextIteratorStreamer,
+ #     pipeline
+ # )
+ # import os
+ # from threading import Thread
+ # import spaces
+ # import time

+ # import langchain
+ # import os
+ # import glob
+ # import gc

+ # # loaders
+ # from langchain.document_loaders import PyPDFLoader, DirectoryLoader

+ # # splits
+ # from langchain.text_splitter import RecursiveCharacterTextSplitter

+ # # prompts
+ # from langchain import PromptTemplate

+ # # vector stores
+ # from langchain_community.vectorstores import FAISS

+ # # models
+ # from langchain.llms import HuggingFacePipeline
+ # from langchain.embeddings import HuggingFaceInstructEmbeddings

+ # # retrievers
+ # from langchain.chains import RetrievalQA


+ # import subprocess

+ # subprocess.run(
+ #     "pip install flash-attn --no-build-isolation",
+ #     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+ #     shell=True,
+ # )


+ # class CFG:
+ #     DEBUG = False

+ #     ### LLM
+ #     model_name = 'justinj92/phi3-orpo'
+ #     temperature = 0.7
+ #     top_p = 0.90
+ #     repetition_penalty = 1.15
+ #     max_len = 8192
+ #     max_new_tokens = 512
+
+ #     ### splitting
+ #     split_chunk_size = 800
+ #     split_overlap = 400

+ #     ### embeddings
+ #     embeddings_model_repo = 'BAAI/bge-base-en-v1.5'

+ #     ### similar passages
+ #     k = 6

+ #     ### paths
+ #     PDFs_path = './data'
+ #     Embeddings_path = './embeddings/input'
+ #     Output_folder = './ml-papers-vector'

+ # loader = DirectoryLoader(CFG.PDFs_path, glob="*.pdf", loader_cls=PyPDFLoader)

+ # documents = loader.load()


+ # text_splitter = RecursiveCharacterTextSplitter(chunk_size = CFG.split_chunk_size, chunk_overlap = CFG.split_overlap)
+ # texts = text_splitter.split_documents(documents)

+ # if not os.path.exists(CFG.Embeddings_path + '/index.faiss'):
+ #     embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
+ #     vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)
+ #     vectordb.save_local(f"{CFG.Output_folder}/faiss_index_ml_papers")

+ # embeddings = HuggingFaceInstructEmbeddings(model_name = CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
+ # vectordb = FAISS.load_local(CFG.Output_folder + '/faiss_index_ml_papers', embeddings, allow_dangerous_deserialization=True)


+ # def build_model(model_repo = CFG.model_name):
+ #     tokenizer = AutoTokenizer.from_pretrained(model_repo)
+ #     model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
+ #     if torch.cuda.is_available():
+ #         device = torch.device("cuda")
+ #         print(f"Using GPU: {torch.cuda.get_device_name(device)}")
+ #     else:
+ #         device = torch.device("cpu")
+ #         print("Using CPU")
+ #     device = torch.device("cuda")
+ #     model = model.to(device)

+ #     return tokenizer, model


+ # tok, model = build_model(model_repo = CFG.model_name)

+ # terminators = [
+ #     tok.eos_token_id,
+ #     32007,
+ #     32011,
+ #     32001,
+ #     32000
+ # ]


+ # pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)

+ # llm = HuggingFacePipeline(pipeline = pipe)

+ # prompt_template = """
+ # <|system|>

+ # You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).

+ # You are given some extracted parts from machine learning papers along with a question.

+ # If you don't know the answer, just say "I don't know." Don't try to make up an answer.

+ # It is very important that you ALWAYS answer the question in the same language the question is in. Remember to always do that.

+ # Use only the following pieces of context to answer the question at the end.

+ # <|end|>

+ # <|user|>

+ # Context: {context}

+ # Question is below. Remember to answer in the same language:

+ # Question: {question}

+ # <|end|>

+ # <|assistant|>

+ # """


+ # PROMPT = PromptTemplate(
+ #     template = prompt_template,
+ #     input_variables = ["context", "question"]
+ # )

+ # retriever = vectordb.as_retriever(
+ #     search_type = "similarity",
+ #     search_kwargs = {"k": CFG.k}
+ # )

+ # qa_chain = RetrievalQA.from_chain_type(
+ #     llm = llm,
+ #     chain_type = "stuff", # map_reduce, map_rerank, stuff, refine
+ #     retriever = retriever,
+ #     chain_type_kwargs = {"prompt": PROMPT},
+ #     return_source_documents = True,
+ #     verbose = False
+ # )


+ # def wrap_text_preserve_newlines(text, width=1500):
+ #     # Split the input text into lines based on newline characters
+ #     lines = text.split('\n')

+ #     # Wrap each line individually
+ #     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

+ #     # Join the wrapped lines back together using newline characters
+ #     wrapped_text = '\n'.join(wrapped_lines)

+ #     return wrapped_text


+ # def process_llm_response(llm_response):
+ #     ans = wrap_text_preserve_newlines(llm_response['result'])

+ #     sources_used = ' \n'.join(
+ #         [
+ #             source.metadata['source'].split('/')[-1][:-4]
+ #             + ' - page: '
+ #             + str(source.metadata['page'])
+ #             for source in llm_response['source_documents']
+ #         ]
+ #     )

+ #     ans = ans + '\n\nSources: \n' + sources_used

+ #     ### return only the text after the pattern
+ #     pattern = "<|assistant|>"
+ #     index = ans.find(pattern)
+ #     if index != -1:
+ #         ans = ans[index + len(pattern):]

+ #     return ans.strip()

+ # @spaces.GPU
+ # def llm_ans(message, history):

+ #     llm_response = qa_chain.invoke(message)
+ #     ans = process_llm_response(llm_response)

+ #     return ans
+
+
+ # # @spaces.GPU(duration=60)
+ # # def chat(message, history, temperature, do_sample, max_tokens):
+ # #     chat = [{"role": "system", "content": "You are ORPO Tuned Phi Beast. Answer all questions in the most helpful way. No yapping."}]
+ # #     for item in history:
+ # #         chat.append({"role": "user", "content": item[0]})
+ # #         if item[1] is not None:
+ # #             chat.append({"role": "assistant", "content": item[1]})
+ # #     chat.append({"role": "user", "content": message})
+ # #     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+ # #     model_inputs = tok([messages], return_tensors="pt").to(device)
+ # #     streamer = TextIteratorStreamer(
+ # #         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
+ # #     )
+ # #     generate_kwargs = dict(
+ # #         model_inputs,
+ # #         streamer=streamer,
+ # #         max_new_tokens=max_tokens,
+ # #         do_sample=True,
+ # #         temperature=temperature,
+ # #         eos_token_id=terminators,
+ # #     )
+
+ # #     if temperature == 0:
+ # #         generate_kwargs["do_sample"] = False
+
+ # #     t = Thread(target=model.generate, kwargs=generate_kwargs)
+ # #     t.start()
+
+ # #     partial_text = ""
+ # #     for new_text in streamer:
+ # #         partial_text += new_text
+ # #         yield partial_text
+
+ # #     yield partial_text
+
+
+ # demo = gr.ChatInterface(
+ #     fn=llm_ans,
+ #     examples=[["Write me a poem about Machine Learning."]],
+ #     # multimodal=False,
+ #     stop_btn="Stop Generation",
+ #     title="Chat With LLMs",
+ #     description="Now Running Phi3-ORPO",
+ # )
+ # demo.launch()
+
+
+ import gradio as gr
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ import os
+ from threading import Thread

+ import langchain
+ from langchain.document_loaders import DirectoryLoader, PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain import PromptTemplate
+ from langchain_community.vectorstores import FAISS
+ from langchain.llms import HuggingFacePipeline
+ from langchain.embeddings import HuggingFaceInstructEmbeddings
+ from langchain.chains import RetrievalQA
+ import subprocess
+ import textwrap

+ # Installation command for specific libraries
+ subprocess.run("pip install flash-attn --no-build-isolation", env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, shell=True)
+
+ class CFG:
+     DEBUG = False
+     model_name = 'justinj92/phi3-orpo'
+     temperature = 0.7
+     top_p = 0.90
+     repetition_penalty = 1.15
+     max_len = 8192
+     max_new_tokens = 512
+     split_chunk_size = 800
+     split_overlap = 400
+     embeddings_model_repo = 'BAAI/bge-base-en-v1.5'
+     k = 6
+     PDFs_path = './data'
+     Embeddings_path = './embeddings/input'
+     Output_folder = './ml-papers-vector'

+ loader = DirectoryLoader(CFG.PDFs_path, glob="*.pdf", loader_cls=PyPDFLoader)
+ documents = loader.load()

+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.split_chunk_size, chunk_overlap=CFG.split_overlap)
+ texts = text_splitter.split_documents(documents)

+ if not os.path.exists(f"{CFG.Embeddings_path}/index.faiss"):
+     embeddings = HuggingFaceInstructEmbeddings(model_name=CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
+     vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)
+     vectordb.save_local(f"{CFG.Output_folder}/faiss_index_ml_papers")
+
+ embeddings = HuggingFaceInstructEmbeddings(model_name=CFG.embeddings_model_repo, model_kwargs={"device":"cuda"})
+ vectordb = FAISS.load_local(f"{CFG.Output_folder}/faiss_index_ml_papers", embeddings, allow_dangerous_deserialization=True)
+
+ @spaces.GPU
+ def build_model(model_repo=CFG.model_name):
+     tokenizer = AutoTokenizer.from_pretrained(model_repo)
+     model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model = model.to(device)
+     return tokenizer, model
+
+ tok, model = build_model()
+
+ terminators = [tok.eos_token_id, 32007, 32011, 32001, 32000]
+
+ pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)
+ llm = HuggingFacePipeline(pipeline=pipe)
+
+ prompt_template = """
+ You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).
+ You are given some extracted parts from machine learning papers along with a question.
+ If you don't know the answer, just say "I don't know." Don't try to make up an answer.
+ It is very important that you ALWAYS answer the question in the same language the question is in. Remember to always do that.
+ Use only the following pieces of context to answer the question at the end.
+ Context: {context}
+ Question is below. Remember to answer in the same language:
+ Question: {question}
+ """
+
+ PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
+
+ retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": CFG.k})
+ qa_chain = RetrievalQA(llm=llm, retriever=retriever, prompt_template=PROMPT, return_source_documents=True, verbose=False)
+
+ def process_llm_response(llm_response):
+     ans = textwrap.fill(llm_response['result'], width=1500)
+     sources_used = ' \n'.join([f"{source.metadata['source'].split('/')[-1][:-4]} - page: {str(source.metadata['page'])}" for source in llm_response['source_documents']])
+     return f"{ans}\n\nSources:\n{sources_used}"
+
+ @gr.Interface(fn=process_llm_response, inputs=["text", "state"], outputs="text", title="Chat With LLMs", description="Now Running Phi3-ORPO")
+ def llm_ans(message, history):
+     llm_response = qa_chain.invoke(message)
+     return process_llm_response(llm_response)

+ llm_ans.launch()
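
Editor's sketch (not part of the commit): as committed, the new tail of app.py references spaces without importing it, applies gr.Interface as a decorator (in Gradio it is a class, not a decorator), and constructs RetrievalQA directly rather than via RetrievalQA.from_chain_type. A minimal alternative wiring, reusing the RetrievalQA.from_chain_type and gr.ChatInterface pattern from the previous revision, could look like the following; llm, vectordb, PROMPT, CFG, and process_llm_response are assumed to be defined earlier in the script.

    import gradio as gr
    import spaces
    from langchain.chains import RetrievalQA

    # Build the retrieval chain with the prompt defined above; the "stuff" chain
    # places all retrieved chunks into a single context window.
    retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": CFG.k})
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": PROMPT},
        return_source_documents=True,
    )

    @spaces.GPU
    def llm_ans(message, history):
        # history is supplied by gr.ChatInterface but unused; the chain is stateless.
        llm_response = qa_chain.invoke(message)
        return process_llm_response(llm_response)

    demo = gr.ChatInterface(
        fn=llm_ans,
        title="Chat With LLMs",
        description="Now Running Phi3-ORPO",
    )
    demo.launch()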