syedmudassir16 committed on
Commit
d4b9099
1 Parent(s): 939b7da

Update app.py

Files changed (1)
  1. app.py +86 -184
app.py CHANGED
@@ -1,5 +1,3 @@
- "Single Thread"
-
  import os
  import multiprocessing
  import concurrent.futures
@@ -10,18 +8,19 @@ from sentence_transformers import SentenceTransformer
  import faiss
  import torch
  import numpy as np
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
  from datetime import datetime
  import json
  import gradio as gr
- import re
  
  class DocumentRetrievalAndGeneration:
      def __init__(self, embedding_model_name, lm_model_id, data_folder):
          self.all_splits = self.load_documents(data_folder)
          self.embeddings = SentenceTransformer(embedding_model_name)
          self.gpu_index = self.create_faiss_index()
-         self.llm = self.initialize_llm(lm_model_id)
  
      def load_documents(self, folder_path):
          loader = DirectoryLoader(folder_path, loader_cls=TextLoader)
@@ -30,7 +29,7 @@ class DocumentRetrievalAndGeneration:
          all_splits = text_splitter.split_documents(documents)
          print('Length of documents:', len(documents))
          print("LEN of all_splits", len(all_splits))
-         for i in range(5):
              print(all_splits[i].page_content)
          return all_splits
  
@@ -44,124 +43,101 @@ class DocumentRetrievalAndGeneration:
          return gpu_index
  
      def initialize_llm(self, model_id):
-         bnb_config = BitsAndBytesConfig(
              load_in_4bit=True,
              bnb_4bit_use_double_quant=True,
              bnb_4bit_quant_type="nf4",
              bnb_4bit_compute_dtype=torch.bfloat16
          )
-         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-         model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
          tokenizer = AutoTokenizer.from_pretrained(model_id)
-         generate_text = pipeline(
-             model=model,
-             tokenizer=tokenizer,
-             return_full_text=True,
-             task='text-generation',
-             temperature=0.6,
-             max_new_tokens=256,
          )
-         return generate_text
  
-     def generate_response_with_timeout(self, model_inputs):
          try:
-             with concurrent.futures.ThreadPoolExecutor() as executor:
-                 future = executor.submit(self.llm.model.generate, model_inputs, max_new_tokens=1000, do_sample=True)
-                 generated_ids = future.result(timeout=60)  # Timeout set to 60 seconds
-                 return generated_ids
-         except concurrent.futures.TimeoutError:
-             raise TimeoutError("Text generation process timed out")
- 
      def query_and_generate_response(self, query):
          query_embedding = self.embeddings.encode(query, convert_to_tensor=True).cpu().numpy()
-         distances, indices = self.gpu_index.search(np.array([query_embedding]), k=5)
- 
          content = ""
-         for idx in indices[0]:
              content += "-" * 50 + "\n"
              content += self.all_splits[idx].page_content + "\n"
              print("CHUNK", idx)
              print(self.all_splits[idx].page_content)
              print("############################")
-         prompt = f"""<s>
-         You are a knowledgeable assistant with access to a comprehensive database.
-         I need you to answer my question and provide related information in a specific format.
-         I have provided five relatable json files {content}, choose the most suitable chunks for answering the query
-         Here's what I need:
-         Include a final answer without additional comments, sign-offs, or extra phrases. Be direct and to the point.
-         content
-         Here's my question:
-         Query:{query}
-         Solution==>
-         RETURN ONLY SOLUTION . IF THEIR IS NO ANSWER RELATABLE IN RETRIEVED CHUNKS , RETURN " NO SOLUTION AVAILABLE"
-         Example1
-         Query: "How to use IPU1_0 instead of A15_0 to process NDK in TDA2x-EVM",
-         Solution: "To use IPU1_0 instead of A15_0 to process NDK in TDA2x-EVM, you need to modify the configuration file of the NDK application. Specifically, change the processor reference from 'A15_0' to 'IPU1_0'.",
- 
-         Example2
-         Query: "Can BQ25896 support I2C interface?",
-         Solution: "Yes, the BQ25896 charger supports the I2C interface for communication."
-         </s>
-         """
-         prompt2 = f"""
-         <s>
-         You are a knowledgeable assistant with access to a comprehensive database.
-         I need you to answer my question and provide related information in a specific format.
-         I have provided five relatable JSON files. Choose the most suitable chunks for answering the query.
-         Here's what I need:
-         Include a final answer without additional comments, sign-offs, or extra phrases. Be direct and to the point.
- 
-         Examples:
-         Example1:
-         Query: "How to use IPU1_0 instead of A15_0 to process NDK in TDA2x-EVM",
-         Solution: "To use IPU1_0 instead of A15_0 to process NDK in TDA2x-EVM, you need to modify the configuration file of the NDK application. Specifically, change the processor reference from 'A15_0' to 'IPU1_0'."
- 
-         Example2:
-         Query: "Can BQ25896 support I2C interface?",
-         Solution: "Yes, the BQ25896 charger supports the I2C interface for communication."
- 
-         content: {content}
- 
-         Here's my question:
-         Query: {query}
- 
-         Solution==>
-         RETURN ONLY SOLUTION. IF THERE IS NO ANSWER RELATABLE IN RETRIEVED CHUNKS, RETURN "NO SOLUTION AVAILABLE"
-         </s>
-         """
-         # prompt = f"Query: {query}\nSolution: {content}\n"
  
-         # Encode and prepare inputs
-         messages = [{"role": "user", "content": prompt2}]
-         encodeds = self.llm.tokenizer.apply_chat_template(messages, return_tensors="pt")
-         model_inputs = encodeds.to(self.llm.device)
  
-         # Perform inference and measure time
          start_time = datetime.now()
-         generated_ids = self.generate_response_with_timeout(model_inputs)
-         # generated_ids = self.llm.model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
          elapsed_time = datetime.now() - start_time
  
-         # Decode and return output
-         decoded = self.llm.tokenizer.batch_decode(generated_ids)
-         generated_response = decoded[0]
-         match1 = re.search(r'\[/INST\](.*?)</s>', generated_response, re.DOTALL)
- 
-         match2 = re.search(r'Solution:(.*?)</s>', generated_response, re.DOTALL | re.IGNORECASE)
-         if match1:
-             solution_text = match1.group(1).strip()
-             print(solution_text)
-             if "Solution:" in solution_text:
-                 solution_text = solution_text.split("Solution:", 1)[1].strip()
-         elif match2:
-             solution_text = match2.group(1).strip()
-             print(solution_text)
- 
-         else:
-             solution_text=generated_response
          print("Generated response:", generated_response)
          print("Time elapsed:", elapsed_time)
-         print("Device in use:", self.llm.device)
  
          return solution_text, content
  
@@ -170,29 +146,25 @@ class DocumentRetrievalAndGeneration:
          return response
  
  if __name__ == "__main__":
-     # Example usage
      embedding_model_name = 'flax-sentence-embeddings/all_datasets_v3_MiniLM-L12'
      lm_model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
      data_folder = 'sample_embedding_folder2'
  
      doc_retrieval_gen = DocumentRetrievalAndGeneration(embedding_model_name, lm_model_id, data_folder)
  
-     """Dual Interface"""
- 
      def launch_interface():
          css_code = """
          .gradio-container {
              background-color: #daccdb;
          }
-         /* Button styling for all buttons */
          button {
-             background-color: #927fc7; /* Default color for all other buttons */
              color: black;
              border: 1px solid black;
              padding: 10px;
              margin-right: 10px;
-             font-size: 16px; /* Increase font size */
-             font-weight: bold; /* Make text bold */
          }
          """
          EXAMPLES = [
@@ -200,88 +172,18 @@ if __name__ == "__main__":
              "I'm using Code Composer Studio 5.4.0.00091 and enabled FPv4SPD16 floating point support for CortexM4 in TDA2. However, after building the project, the .asm file shows --float_support=vfplib instead of FPv4SPD16. Why is this happening?",
              "Could you clarify the maximum number of cameras that can be connected simultaneously to the video input ports on the TDA2x SoC, considering it supports up to 10 multiplexed input ports and includes 3 dedicated video input modules?"
          ]
- 
-         file_path = "ticketNames.txt"
- 
-         # Read the file content
-         with open(file_path, "r") as file:
-             content = file.read()
-         ticket_names = json.loads(content)
-         dropdown = gr.Dropdown(label="Sample queries", choices=ticket_names)
- 
-         # Define Gradio interfaces
-         tab1 = gr.Interface(
              fn=doc_retrieval_gen.qa_infer_gradio,
              inputs=[gr.Textbox(label="QUERY", placeholder="Enter your query here")],
              allow_flagging='never',
              examples=EXAMPLES,
              cache_examples=False,
-             outputs=[gr.Textbox(label="SOLUTION"), gr.Textbox(label="RELATED QUERIES")],
-             css=css_code
          )
-         tab2 = gr.Interface(
-             fn=doc_retrieval_gen.qa_infer_gradio,
-             inputs=[dropdown],
-             allow_flagging='never',
-             outputs=[gr.Textbox(label="SOLUTION"), gr.Textbox(label="RELATED QUERIES")],
-             css=css_code
-         )
- 
-         # Combine interfaces into a tabbed interface
-         gr.TabbedInterface(
-             [tab1, tab2],
-             ["Textbox Input", "FAQs"],
-             title="TI E2E FORUM",
-             css=css_code
-         ).launch(debug=True)
- 
-     # Launch the interface
-     launch_interface()
- 
- 
- 
-     """Single Interface"""
-     # def launch_interface():
-     #     css_code = """
-     #     .gradio-container {
-     #         background-color: #daccdb;
-     #     }
-     #     /* Button styling for all buttons */
-     #     button {
-     #         background-color: #927fc7; /* Default color for all other buttons */
-     #         color: black;
-     #         border: 1px solid black;
-     #         padding: 10px;
-     #         margin-right: 10px;
-     #         font-size: 16px; /* Increase font size */
-     #         font-weight: bold; /* Make text bold */
-     #     }
-     #     """
-     #     EXAMPLES = ["On which devices can the VIP and CSI2 modules operate simultaneously? ",
-     #                 "I'm using Code Composer Studio 5.4.0.00091 and enabled FPv4SPD16 floating point support for CortexM4 in TDA2. However, after building the project, the .asm file shows --float_support=vfplib instead of FPv4SPD16. Why is this happening?",
-     #                 "Could you clarify the maximum number of cameras that can be connected simultaneously to the video input ports on the TDA2x SoC, considering it supports up to 10 multiplexed input ports and includes 3 dedicated video input modules?"]
- 
-     #     file_path = "ticketNames.txt"
- 
-     #     # Read the file content
-     #     with open(file_path, "r") as file:
-     #         content = file.read()
-     #     ticket_names = json.loads(content)
-     #     dropdown = gr.Dropdown(label="Sample queries", choices=ticket_names)
- 
-     #     # Define Gradio interface
-     #     interface = gr.Interface(
-     #         fn=doc_retrieval_gen.qa_infer_gradio,
-     #         inputs=[gr.Textbox(label="QUERY", placeholder="Enter your query here")],
-     #         allow_flagging='never',
-     #         examples=EXAMPLES,
-     #         cache_examples=False,
-     #         outputs=[gr.Textbox(label="SOLUTION"), gr.Textbox(label="RELATED QUERIES")],
-     #         css=css_code
-     #     )
- 
-     #     # Launch Gradio interface
-     #     interface.launch(debug=True)
- 
-     #     # Launch the interface
-     #     launch_interface()

  import os
  import multiprocessing
  import concurrent.futures
  
  import faiss
  import torch
  import numpy as np
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
  from datetime import datetime
  import json
  import gradio as gr
+ import re
+ from threading import Thread
  
  class DocumentRetrievalAndGeneration:
      def __init__(self, embedding_model_name, lm_model_id, data_folder):
          self.all_splits = self.load_documents(data_folder)
          self.embeddings = SentenceTransformer(embedding_model_name)
          self.gpu_index = self.create_faiss_index()
+         self.tokenizer, self.model = self.initialize_llm(lm_model_id)
  
      def load_documents(self, folder_path):
          loader = DirectoryLoader(folder_path, loader_cls=TextLoader)
  
          all_splits = text_splitter.split_documents(documents)
          print('Length of documents:', len(documents))
          print("LEN of all_splits", len(all_splits))
+         for i in range(3):
              print(all_splits[i].page_content)
          return all_splits
  
          return gpu_index
  
      def initialize_llm(self, model_id):
+         quantization_config = BitsAndBytesConfig(
              load_in_4bit=True,
              bnb_4bit_use_double_quant=True,
              bnb_4bit_quant_type="nf4",
              bnb_4bit_compute_dtype=torch.bfloat16
          )
          tokenizer = AutoTokenizer.from_pretrained(model_id)
+         model = AutoModelForCausalLM.from_pretrained(
+             model_id,
+             torch_dtype=torch.bfloat16,
+             device_map="auto",
+             quantization_config=quantization_config
          )
+         return tokenizer, model
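Note: the 4-bit NF4 path above assumes a CUDA-capable GPU plus the bitsandbytes package, and device_map="auto" additionally relies on accelerate for weight placement. A minimal CPU-only fallback sketch (hypothetical, not part of this commit) could be:

    if not torch.cuda.is_available():
        model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)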
  
+     def generate_response_with_timeout(self, input_ids, max_new_tokens=1000):
          try:
+             streamer = TextIteratorStreamer(self.tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
+             generate_kwargs = dict(
+                 input_ids=input_ids,
+                 max_new_tokens=max_new_tokens,
+                 do_sample=True,
+                 top_p=1.0,
+                 top_k=20,
+                 temperature=0.8,
+                 repetition_penalty=1.2,
+                 eos_token_id=[128001, 128008, 128009],
+                 streamer=streamer,
+             )
+ 
+             thread = Thread(target=self.model.generate, kwargs=generate_kwargs)
+             thread.start()
+ 
+             generated_text = ""
+             for new_text in streamer:
+                 generated_text += new_text
+ 
+             return generated_text
+         except Exception as e:
+             print(f"Error in generate_response_with_timeout: {str(e)}")
+             return "Text generation process encountered an error"
+ 
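Note: the timeout=60.0 passed to TextIteratorStreamer bounds each wait for the next streamed chunk rather than the total generation time; if nothing arrives within 60 s the iterator raises queue.Empty, which the broad except clause above turns into the error string. The eos_token_id values correspond to the Llama 3.1 stop tokens (<|end_of_text|>, <|eom_id|>, <|eot_id|>). One might also join the worker thread once streaming ends, e.g.:

    thread.join()  # make sure the background generate() call has fully finished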
      def query_and_generate_response(self, query):
+         similarityThreshold = 1
          query_embedding = self.embeddings.encode(query, convert_to_tensor=True).cpu().numpy()
+         distances, indices = self.gpu_index.search(np.array([query_embedding]), k=3)
+         print("Distance", distances, "indices", indices)
          content = ""
+         filtered_results = []
+         for idx, distance in zip(indices[0], distances[0]):
+             if distance <= similarityThreshold:
+                 filtered_results.append((idx, distance))
+         for idx, distance in filtered_results:
              content += "-" * 50 + "\n"
              content += self.all_splits[idx].page_content + "\n"
              print("CHUNK", idx)
+             print("Distance:", distance)
              print(self.all_splits[idx].page_content)
              print("############################")
  
+         conversation = [
+             {"role": "system", "content": "You are a knowledgeable assistant with access to a comprehensive database."},
+             {"role": "user", "content": f"""
+             I need you to answer my question and provide related information in a specific format.
+             I have provided the retrieved JSON chunks {content}; choose the most suitable chunks for answering the query.
+             RETURN ONLY THE SOLUTION, without additional comments, sign-offs, retrieved chunks, references to any ticket, or extra phrases. Be direct and to the point.
+             IF THERE IS NO ANSWER AVAILABLE IN THE RETRIEVED CHUNKS, RETURN "NO SOLUTION AVAILABLE".
+             DO NOT GIVE REFERENCES TO ANY CHUNKS OR TICKETS; BE ON POINT.
+ 
+             Here's my question:
+             Query: {query}
+             Solution==>
+             """}
+         ]
+         # Include a final answer without additional comments, sign-offs, or extra phrases. Be direct and to the point.
+         input_ids = self.tokenizer.apply_chat_template(conversation, return_tensors="pt").to(self.model.device)
  
          start_time = datetime.now()
+         generated_response = self.generate_response_with_timeout(input_ids)
          elapsed_time = datetime.now() - start_time
  
          print("Generated response:", generated_response)
          print("Time elapsed:", elapsed_time)
+         print("Device in use:", self.model.device)
+ 
+         solution_text = generated_response.strip()
+         if "Solution:" in solution_text:
+             solution_text = solution_text.split("Solution:", 1)[1].strip()
+ 
+         # Post-processing to remove the "assistant" prefix
+         solution_text = re.sub(r'^assistant\s*', '', solution_text, flags=re.IGNORECASE)
+         solution_text = solution_text.strip()
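Note: the leading "assistant" stripped by the regex above is likely a side effect of calling apply_chat_template without add_generation_prompt=True; the model then emits the assistant header itself, and with skip_special_tokens=True the bare word survives decoding. A hedged alternative is:

    input_ids = self.tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(self.model.device)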
  
          return solution_text, content
  
          return response
  
  if __name__ == "__main__":
      embedding_model_name = 'flax-sentence-embeddings/all_datasets_v3_MiniLM-L12'
      lm_model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
      data_folder = 'sample_embedding_folder2'
  
      doc_retrieval_gen = DocumentRetrievalAndGeneration(embedding_model_name, lm_model_id, data_folder)
  
      def launch_interface():
          css_code = """
          .gradio-container {
              background-color: #daccdb;
          }
          button {
+             background-color: #927fc7;
              color: black;
              border: 1px solid black;
              padding: 10px;
              margin-right: 10px;
+             font-size: 16px;
+             font-weight: bold;
          }
          """
          EXAMPLES = [
              "I'm using Code Composer Studio 5.4.0.00091 and enabled FPv4SPD16 floating point support for CortexM4 in TDA2. However, after building the project, the .asm file shows --float_support=vfplib instead of FPv4SPD16. Why is this happening?",
              "Could you clarify the maximum number of cameras that can be connected simultaneously to the video input ports on the TDA2x SoC, considering it supports up to 10 multiplexed input ports and includes 3 dedicated video input modules?"
          ]
+ 
+         interface = gr.Interface(
              fn=doc_retrieval_gen.qa_infer_gradio,
              inputs=[gr.Textbox(label="QUERY", placeholder="Enter your query here")],
              allow_flagging='never',
              examples=EXAMPLES,
              cache_examples=False,
+             outputs=[gr.Textbox(label="RESPONSE"), gr.Textbox(label="RELATED QUERIES")],
+             css=css_code,
+             title="TI E2E FORUM"
          )
  
+         interface.launch(debug=True)
  
+     launch_interface()
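For reference, a minimal sketch of exercising the updated class outside Gradio, assuming the same model IDs and local data folder used above:

    retriever = DocumentRetrievalAndGeneration(
        'flax-sentence-embeddings/all_datasets_v3_MiniLM-L12',
        "meta-llama/Meta-Llama-3.1-8B-Instruct",
        'sample_embedding_folder2',
    )
    solution, retrieved_chunks = retriever.query_and_generate_response("Can BQ25896 support I2C interface?")
    print(solution)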