awacke1 committed on
Commit 1b6dd3a
1 Parent(s): 16b0459

Update app.py

Files changed (1)
  1. app.py +54 -188
app.py CHANGED
@@ -33,79 +33,22 @@ from PyPDF2 import PdfReader
 from templates import bot_template, css, user_template
 from xml.etree import ElementTree as ET
 
-def add_Med_Licensing_Exam_Dataset():
-    import streamlit as st
-    from datasets import load_dataset
-    dataset = load_dataset("augtoma/usmle_step_1")['test']  # Using 'test' split
-    st.title("USMLE Step 1 Dataset Viewer")
-    if len(dataset) == 0:
-        st.write("😢 The dataset is empty.")
-    else:
-        st.write("""
-        🔍 Use the search box to filter questions or use the grid to scroll through the dataset.
-        """)
-
-        # 👩‍🔬 Search Box
-        search_term = st.text_input("Search for a specific question:", "")
-        # 🎛 Pagination
-        records_per_page = 100
-        num_records = len(dataset)
-        num_pages = max(int(num_records / records_per_page), 1)
-
-        # Skip generating the slider if num_pages is 1 (i.e., all records fit in one page)
-        if num_pages > 1:
-            page_number = st.select_slider("Select page:", options=list(range(1, num_pages + 1)))
-        else:
-            page_number = 1  # Only one page
-
-        # 📊 Display Data
-        start_idx = (page_number - 1) * records_per_page
-        end_idx = start_idx + records_per_page
-
-        # 🧪 Apply the Search Filter
-        filtered_data = []
-        for record in dataset[start_idx:end_idx]:
-            if isinstance(record, dict) and 'text' in record and 'id' in record:
-                if search_term:
-                    if search_term.lower() in record['text'].lower():
-                        filtered_data.append(record)
-                else:
-                    filtered_data.append(record)
-
-        # 🌐 Render the Grid
-        for record in filtered_data:
-            st.write(f"## Question ID: {record['id']}")
-            st.write(f"### Question:")
-            st.write(f"{record['text']}")
-            st.write(f"### Answer:")
-            st.write(f"{record['answer']}")
-            st.write("---")
-
-        st.write(f"😊 Total Records: {num_records} | 📄 Displaying {start_idx+1} to {min(end_idx, num_records)}")
-
-# 1. Constants and Top Level UI Variables
-
-# My Inference API Copy
-# API_URL = 'https://qe55p8afio98s0u3.us-east-1.aws.endpoints.huggingface.cloud'  # Dr Llama
-# Original:
-API_URL = "https://api-inference.huggingface.co/models/meta-llama/Llama-2-7b-chat-hf"
+# Llama Constants
+API_URL = 'https://qe55p8afio98s0u3.us-east-1.aws.endpoints.huggingface.cloud'  # Dr Llama
 API_KEY = os.getenv('API_KEY')
-MODEL1="meta-llama/Llama-2-7b-chat-hf"
-MODEL1URL="https://huggingface.co/meta-llama/Llama-2-7b-chat-hf"
-HF_KEY = os.getenv('HF_KEY')
 headers = {
-    "Authorization": f"Bearer {HF_KEY}",
+    "Authorization": f"Bearer {API_KEY}",
     "Content-Type": "application/json"
 }
 key = os.getenv('OPENAI_API_KEY')
 prompt = f"Write instructions to teach anyone to write a discharge plan. List the entities, features and relationships to CCDA and FHIR objects in boldface."
 # page config and sidebar declares up front allow all other functions to see global class variables
-# st.set_page_config(page_title="GPT Streamlit Document Reasoner", layout="wide")
+st.set_page_config(page_title="GPT Streamlit Document Reasoner", layout="wide")
+
+# UI Controls
 should_save = st.sidebar.checkbox("💾 Save", value=True, help="Save your session data.")
 
-# 2. Prompt label button demo for LLM
+# Function to add witty and humor buttons
 def add_witty_humor_buttons():
     with st.expander("Wit and Humor 🤣", expanded=True):
         # Tip about the Dromedary family
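Review note: this hunk swaps the public Inference API URL for the dedicated Llama endpoint and authenticates with API_KEY rather than HF_KEY. A minimal sketch of how these constants are consumed by the query()/get_output() helpers later in the diff; the raise_for_status() call is an illustrative addition, not part of the commit:

```python
import os
import requests

# Constants as introduced in this hunk
API_URL = 'https://qe55p8afio98s0u3.us-east-1.aws.endpoints.huggingface.cloud'  # Dr Llama
headers = {
    "Authorization": f"Bearer {os.getenv('API_KEY')}",
    "Content-Type": "application/json",
}

def get_output(prompt):
    # Same request shape as the query()/get_output() helpers further down
    response = requests.post(API_URL, headers=headers, json={"inputs": prompt})
    response.raise_for_status()  # illustration only; the app renders response.json() directly
    return response.json()
```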
@@ -151,40 +94,8 @@ def add_witty_humor_buttons():
     if col7[0].button("More Funny Rhymes 🎙️"):
         StreamLLMChatResponse(descriptions["More Funny Rhymes 🎙️"])
 
-def addDocumentHTML5(result):
-    documentHTML5 = '''
-    <!DOCTYPE html>
-    <html>
-    <head>
-        <title>Read It Aloud</title>
-        <script type="text/javascript">
-            function readAloud() {
-                const text = document.getElementById("textArea").value;
-                const speech = new SpeechSynthesisUtterance(text);
-                window.speechSynthesis.speak(speech);
-            }
-        </script>
-    </head>
-    <body>
-        <h1>🔊 Read It Aloud</h1>
-        <textarea id="textArea" rows="10" cols="80">
-    '''
-    documentHTML5 = documentHTML5 + result
-    documentHTML5 = documentHTML5 + '''
-    </textarea>
-    <br>
-    <button onclick="readAloud()">🔊 Read Aloud</button>
-    </body>
-    </html>
-    '''
-
-    import streamlit.components.v1 as components  # Import Streamlit
-    components.html(documentHTML5, width=1280, height=1024)
-    return result
-
-
-# 3. Stream Llama Response
-# @st.cache_resource
+
+# Function to Stream Inference Client for Inference Endpoint Responses
 def StreamLLMChatResponse(prompt):
 
     try:
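The body of StreamLLMChatResponse (new lines 102–131) is collapsed in this view. A sketch of the token-streaming pattern it appears to implement, assuming huggingface_hub's InferenceClient against the endpoint above; everything except the function's role and the `return result` visible in the next hunk is an assumption:

```python
import os
import streamlit as st
from huggingface_hub import InferenceClient

def stream_llm_chat_response(prompt):
    # Assumption: stream tokens from the dedicated endpoint, updating a placeholder
    client = InferenceClient(model=API_URL, token=os.getenv('API_KEY'))
    placeholder = st.empty()
    result = ""
    for token in client.text_generation(prompt, max_new_tokens=512, stream=True):
        result += token
        placeholder.markdown(result)
    return result  # mirrors the `return result` added in the next hunk
```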
@@ -221,27 +132,27 @@ def StreamLLMChatResponse(prompt):
 
         except:
             st.write('Stream llm issue')
-    add_documentHTML5(result)
+    return result
     except:
-        st.write('Llama model is asleep. Starting up now on A10 - please give 5 minutes then retry as KEDA scales up from zero to activate running container(s).')
+        st.write('DromeLlama is asleep. Starting up now on A10 - please give 5 minutes then retry as KEDA scales up from zero to activate running container(s).')
+
+
 
-# 4. Run query with payload
 def query(payload):
     response = requests.post(API_URL, headers=headers, json=payload)
     st.markdown(response.json())
     return response.json()
+
 def get_output(prompt):
     return query({"inputs": prompt})
 
-# 5. Auto name generated output files from time and content
 def generate_filename(prompt, file_type):
     central = pytz.timezone('US/Central')
     safe_date_time = datetime.now(central).strftime("%m%d_%H%M")
     replaced_prompt = prompt.replace(" ", "_").replace("\n", "_")
-    safe_prompt = "".join(x for x in replaced_prompt if x.isalnum() or x == "_")[:45]
+    safe_prompt = "".join(x for x in replaced_prompt if x.isalnum() or x == "_")[:90]
     return f"{safe_date_time}_{safe_prompt}.{file_type}"
 
-# 6. Speech transcription via OpenAI service
 def transcribe_audio(openai_key, file_path, model):
     openai.api_key = openai_key
     OPENAI_API_URL = "https://api.openai.com/v1/audio/transcriptions"
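The only functional change to generate_filename in this hunk is the truncation limit, [:45] to [:90], matching the duplicate definition later in the file. For reference, what the helper produces (timestamp varies with the US/Central clock):

```python
from datetime import datetime
import pytz

def generate_filename(prompt, file_type):
    central = pytz.timezone('US/Central')
    safe_date_time = datetime.now(central).strftime("%m%d_%H%M")
    replaced_prompt = prompt.replace(" ", "_").replace("\n", "_")
    safe_prompt = "".join(x for x in replaced_prompt if x.isalnum() or x == "_")[:90]
    return f"{safe_date_time}_{safe_prompt}.{file_type}"

# generate_filename("Write a discharge plan", "md")
# -> e.g. "0913_0830_Write_a_discharge_plan.md"
```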
@@ -265,7 +176,6 @@ def transcribe_audio(openai_key, file_path, model):
     st.error("Error in API call.")
     return None
 
-# 7. Auto stop on silence audio control for recording WAV files
 def save_and_play_audio(audio_recorder):
     audio_bytes = audio_recorder(key='audio_recorder')
     if audio_bytes:
@@ -276,35 +186,34 @@ def save_and_play_audio(audio_recorder):
         return filename
     return None
 
-# 8. File creator that interprets type and creates output file for text, markdown and code
 def create_file(filename, prompt, response, should_save=True):
     if not should_save:
         return
     base_filename, ext = os.path.splitext(filename)
     has_python_code = bool(re.search(r"```python([\s\S]*?)```", response))
     if ext in ['.txt', '.htm', '.md']:
-        with open(f"{base_filename}.md", 'w') as file:
-            content = prompt.strip() + '\r\n' + response
-            file.write(content)
+        with open(f"{base_filename}-Prompt.txt", 'w') as file:
+            file.write(prompt.strip())
+        with open(f"{base_filename}-Response.md", 'w') as file:
+            file.write(response)
     if has_python_code:
         python_code = re.findall(r"```python([\s\S]*?)```", response)[0].strip()
         with open(f"{base_filename}-Code.py", 'w') as file:
             file.write(python_code)
-        with open(f"{base_filename}.md", 'w') as file:
-            content = prompt.strip() + '\r\n' + response
-            file.write(content)
 
 def truncate_document(document, length):
     return document[:length]
+
 def divide_document(document, max_length):
     return [document[i:i+max_length] for i in range(0, len(document), max_length)]
 
-# 9. Sidebar with UI controls to review and re-run prompts and continue responses
-@st.cache_resource
 def get_table_download_link(file_path):
     with open(file_path, 'r') as file:
-        data = file.read()
-
+        try:
+            data = file.read()
+        except:
+            st.write('')
+            return file_path
     b64 = base64.b64encode(data.encode()).decode()
     file_name = os.path.basename(file_path)
     ext = os.path.splitext(file_name)[1]  # get the file extension
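The try/except added to get_table_download_link catches decode failures and falls back to returning the bare path. A byte-oriented variant would avoid the read-then-re-encode round-trip and serve text and binary files (e.g. the xlsx option in the output menu) alike; this is a hardening sketch, not what the commit does:

```python
import base64
import os

def get_binary_download_link(file_path, mime_type='application/octet-stream'):
    # Read raw bytes so .txt and .xlsx take the same path; no .encode() round-trip
    with open(file_path, 'rb') as f:
        b64 = base64.b64encode(f.read()).decode()
    file_name = os.path.basename(file_path)
    return f'<a href="data:{mime_type};base64,{b64}" target="_blank" download="{file_name}">{file_name}</a>'
```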
@@ -325,16 +234,13 @@ def get_table_download_link(file_path):
     href = f'<a href="data:{mime_type};base64,{b64}" target="_blank" download="{file_name}">{file_name}</a>'
     return href
 
-
 def CompressXML(xml_text):
     root = ET.fromstring(xml_text)
     for elem in list(root.iter()):
         if isinstance(elem.tag, str) and 'Comment' in elem.tag:
             elem.parent.remove(elem)
     return ET.tostring(root, encoding='unicode', method="xml")
-
-# 10. Read in and provide UI for past files
-@st.cache_resource
+
 def read_file_content(file,max_length):
     if file.type == "application/json":
         content = json.load(file)
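CompressXML passes through this hunk unchanged, but note that xml.etree Elements carry no .parent attribute, so elem.parent.remove(elem) raises AttributeError whenever a matching tag is found. A fix sketch that builds a child-to-parent map first, keeping the 'Comment' tag test from the diff:

```python
from xml.etree import ElementTree as ET

def compress_xml(xml_text):
    root = ET.fromstring(xml_text)
    # ElementTree has no parent pointers; build the map explicitly
    parent_of = {child: parent for parent in root.iter() for child in parent}
    for elem in list(root.iter()):
        if isinstance(elem.tag, str) and 'Comment' in elem.tag and elem in parent_of:
            parent_of[elem].remove(elem)
    return ET.tostring(root, encoding='unicode', method="xml")
```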
@@ -356,8 +262,6 @@ def read_file_content(file,max_length):
     else:
         return ""
 
-# 11. Chat with GPT - Caution on quota - now favoring fastest AI pipeline STT Whisper->LLM Llama->TTS
-@st.cache_resource
 def chat_with_model(prompt, document_section, model_choice='gpt-3.5-turbo'):
     model = model_choice
     conversation = [{'role': 'system', 'content': 'You are a helpful assistant.'}]
@@ -386,8 +290,6 @@ def chat_with_model(prompt, document_section, model_choice='gpt-3.5-turbo'):
     st.write(time.time() - start_time)
     return full_reply_content
 
-# 12. Embedding VectorDB for LLM query of documents to text to compress inputs and prompt together as Chat memory using Langchain
-@st.cache_resource
 def chat_with_file_contents(prompt, file_content, model_choice='gpt-3.5-turbo'):
     conversation = [{'role': 'system', 'content': 'You are a helpful assistant.'}]
     conversation.append({'role': 'user', 'content': prompt})
@@ -419,34 +321,31 @@ def extract_file_extension(file):
     else:
         raise ValueError(f"Unable to extract file extension from {file_name}")
 
-# Normalize input as text from PDF and other formats
-@st.cache_resource
 def pdf2txt(docs):
     text = ""
     for file in docs:
         file_extension = extract_file_extension(file)
         st.write(f"File type extension: {file_extension}")
-        if file_extension.lower() in ['py', 'txt', 'html', 'htm', 'xml', 'json']:
-            text += file.getvalue().decode('utf-8')
-        elif file_extension.lower() == 'pdf':
-            from PyPDF2 import PdfReader
-            pdf = PdfReader(BytesIO(file.getvalue()))
-            for page in range(len(pdf.pages)):
-                text += pdf.pages[page].extract_text()  # new PyPDF2 syntax
+        try:
+            if file_extension.lower() in ['py', 'txt', 'html', 'htm', 'xml', 'json']:
+                text += file.getvalue().decode('utf-8')
+            elif file_extension.lower() == 'pdf':
+                from PyPDF2 import PdfReader
+                pdf = PdfReader(BytesIO(file.getvalue()))
+                for page in range(len(pdf.pages)):
+                    text += pdf.pages[page].extract_text()  # new PyPDF2 syntax
+        except Exception as e:
+            st.write(f"Error processing file {file.name}: {e}")
     return text
 
 def txt2chunks(text):
     text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
     return text_splitter.split_text(text)
 
-# Vector Store using FAISS
-@st.cache_resource
 def vector_store(text_chunks):
     embeddings = OpenAIEmbeddings(openai_api_key=key)
     return FAISS.from_texts(texts=text_chunks, embedding=embeddings)
 
-# Memory and Retrieval chains
-@st.cache_resource
 def get_chain(vectorstore):
     llm = ChatOpenAI()
     memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
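These four helpers (pdf2txt, txt2chunks, vector_store, get_chain) lose their @st.cache_resource decorators but otherwise remain the document-QA pipeline. How they chain, assuming get_chain returns a LangChain ConversationalRetrievalChain as its memory setup suggests (its tail is collapsed in this view):

```python
# docs: list returned by st.file_uploader(..., accept_multiple_files=True)
raw_text = pdf2txt(docs)              # PDFs and text files normalized to one string
chunks = txt2chunks(raw_text)         # ~1000-char chunks with 200-char overlap
vectorstore = vector_store(chunks)    # FAISS index over OpenAI embeddings
chain = get_chain(vectorstore)        # retrieval chain with conversation buffer memory
result = chain({'question': 'Summarize the discharge plan.'})  # illustrative query
```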
@@ -479,18 +378,13 @@ def divide_prompt(prompt, max_length):
     chunks.append(' '.join(current_chunk))
     return chunks
 
-
-# 13. Provide way of saving all and deleting all to give way of reviewing output and saving locally before clearing it
-
-@st.cache_resource
 def create_zip_of_files(files):
     zip_name = "all_files.zip"
     with zipfile.ZipFile(zip_name, 'w') as zipf:
         for file in files:
             zipf.write(file)
     return zip_name
-
-@st.cache_resource
+
 def get_zip_download_link(zip_file):
     with open(zip_file, 'rb') as f:
         data = f.read()
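create_zip_of_files and get_zip_download_link likewise only lose their cache decorators. Typical wiring from the sidebar, mirroring how main() uses the single-file download link; an illustrative pairing, not code from the commit:

```python
import glob
import streamlit as st

all_files = glob.glob("*.*")
zip_path = create_zip_of_files(all_files)  # writes all_files.zip in the working dir
st.sidebar.markdown(get_zip_download_link(zip_path), unsafe_allow_html=True)
```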
@@ -498,24 +392,13 @@ def get_zip_download_link(zip_file):
     href = f'<a href="data:application/zip;base64,{b64}" download="{zip_file}">Download All</a>'
     return href
 
-# 14. Inference Endpoints for Whisper (best fastest STT) on NVIDIA T4 and Llama (best fastest AGI LLM) on NVIDIA A10
-# My Inference Endpoint
+
 API_URL_IE = f'https://tonpixzfvq3791u9.us-east-1.aws.endpoints.huggingface.cloud'
-# Original
-API_URL_IE = "https://api-inference.huggingface.co/models/openai/whisper-small.en"
-MODEL2 = "openai/whisper-small.en"
-MODEL2_URL = "https://huggingface.co/openai/whisper-small.en"
-#headers = {
-#    "Authorization": "Bearer XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
-#    "Content-Type": "audio/wav"
-#}
-HF_KEY = os.getenv('HF_KEY')
 headers = {
-    "Authorization": f"Bearer {HF_KEY}",
-    "Content-Type": "audio/wav"
+    "Authorization": "Bearer XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
+    "Content-Type": "audio/wav"
 }
 
-#@st.cache_resource
 def query(filename):
     with open(filename, "rb") as f:
         data = f.read()
  data = f.read()
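Caution on this hunk: the rewritten headers hardcode a masked bearer token where the old code read HF_KEY from the environment, so the Whisper endpoint call cannot authenticate as committed. A sketch with env-based auth restored; header names and the response shape are as in the diff, the helper name is illustrative:

```python
import os
import requests

API_URL_IE = 'https://tonpixzfvq3791u9.us-east-1.aws.endpoints.huggingface.cloud'
whisper_headers = {
    "Authorization": f"Bearer {os.getenv('HF_KEY')}",  # avoid committing literal tokens
    "Content-Type": "audio/wav",
}

def query_whisper(filename):
    # POST raw WAV bytes, as the query() helper above does
    with open(filename, "rb") as f:
        data = f.read()
    response = requests.post(API_URL_IE, headers=whisper_headers, data=data)
    return response.json()  # expected to contain a 'text' key per whisper_main()
```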
@@ -529,7 +412,7 @@ def generate_filename(prompt, file_type):
     safe_prompt = "".join(x for x in replaced_prompt if x.isalnum() or x == "_")[:90]
     return f"{safe_date_time}_{safe_prompt}.{file_type}"
 
-# 15. Audio recorder to Wav file
+# 10. Audio recorder to Wav file:
 def save_and_play_audio(audio_recorder):
     audio_bytes = audio_recorder()
     if audio_bytes:
@@ -539,7 +422,7 @@ def save_and_play_audio(audio_recorder):
     st.audio(audio_bytes, format="audio/wav")
     return filename
 
-# 16. Speech transcription to file output
+# 9B. Speech transcription to file output - OPENAI Whisper
 def transcribe_audio(filename):
     output = query(filename)
     return output
@@ -552,11 +435,7 @@ def whisper_main():
     filename = save_and_play_audio(audio_recorder)
     if filename is not None:
         transcription = transcribe_audio(filename)
-        try:
-            transcription = transcription['text']
-        except:
-            st.write('Whisper model is asleep. Starting up now on T4 GPU - please give 5 minutes then retry as it scales up from zero to activate running container(s).')
-
+        transcription = transcription['text']
         st.write(transcription)
         response = StreamLLMChatResponse(transcription)
         # st.write(response) - redundant with streaming result?
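This hunk removes the cold-start guard, so whisper_main now assumes the endpoint always returns a dict with a 'text' key; while the container scales up from zero that lookup raises. A defensive sketch that preserves the simplified flow; the helper name and message wording are illustrative:

```python
import streamlit as st

def safe_transcribe(filename):
    # transcribe_audio() is the helper defined in the diff above
    transcription = transcribe_audio(filename)
    if isinstance(transcription, dict) and 'text' in transcription:
        return transcription['text']
    st.write('Whisper endpoint is still scaling up - retry in a few minutes.')
    return None
```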
@@ -564,8 +443,6 @@ def whisper_main():
         create_file(filename, transcription, response, should_save)
         #st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)
 
-
-# 17. Main
 def main():
 
     st.title("AI Drome Llama")
@@ -584,7 +461,14 @@ def main():
     openai.api_key = os.getenv('OPENAI_KEY')
     menu = ["txt", "htm", "xlsx", "csv", "md", "py"]
     choice = st.sidebar.selectbox("Output File Type:", menu)
-    model_choice = st.sidebar.radio("Select Model:", ('gpt-3.5-turbo', 'gpt-3.5-turbo-0301'))
+    model_choice = st.sidebar.radio("Select Model:", ('gpt-3.5-turbo', 'gpt-3.5-turbo-0301'))
+
+    #filename = save_and_play_audio(audio_recorder)
+    #if filename is not None:
+    #    transcription = transcribe_audio(key, filename, "whisper-1")
+    #    st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)
+    #    filename = None
+
     user_prompt = st.text_area("Enter prompts, instructions & questions:", '', height=100)
     collength, colupload = st.columns([2,3])  # adjust the ratio as needed
     with collength:
@@ -628,8 +512,6 @@ def main():
         filename = generate_filename(user_prompt, choice)
         create_file(filename, user_prompt, response, should_save)
         st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)
-
-    # Compose a file sidebar of past encounters
     all_files = glob.glob("*.*")
     all_files = [file for file in all_files if len(os.path.splitext(file)[0]) >= 20]  # exclude files with short names
     all_files.sort(key=lambda x: (os.path.splitext(x)[1], x), reverse=True)  # sort by file type and file name in descending order
@@ -665,31 +547,17 @@ def main():
             if st.button("🗑", key="delete_"+file):
                 os.remove(file)
                 st.experimental_rerun()
-
-
     if len(file_contents) > 0:
         if next_action=='open':
             file_content_area = st.text_area("File Contents:", file_contents, height=500)
-            addDocumentHTML5(file_contents)
         if next_action=='md':
             st.markdown(file_contents)
-            addDocumentHTML5(file_contents)
         if next_action=='search':
             file_content_area = st.text_area("File Contents:", file_contents, height=500)
             st.write('Reasoning with your inputs...')
-
-            # new - llama
-            response = StreamLLMChatResponse(file_contents)
-            filename = generate_filename(user_prompt, ".md")
-            #create_file(filename, response, '', should_save)
-            #addDocumentHTML5(file_contents)
-            addDocumentHTML5(response)
-
-            # old - gpt
-            #response = chat_with_model(user_prompt, file_contents, model_choice)
-            #filename = generate_filename(file_contents, choice)
-            #create_file(filename, user_prompt, response, should_save)
-
+            response = chat_with_model(user_prompt, file_contents, model_choice)
+            filename = generate_filename(file_contents, choice)
+            create_file(filename, user_prompt, response, should_save)
             st.experimental_rerun()
 
     # Feedback
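The search action returns to the GPT path (chat_with_model) in place of the removed Llama streaming block. chat_with_model's body sits in a collapsed region, but its visible conversation list and full_reply_content suggest the era's openai-python streaming pattern; a sketch under that assumption, with the document_section handling an illustrative guess:

```python
import openai

def chat_with_model_sketch(prompt, document_section, model_choice='gpt-3.5-turbo'):
    conversation = [{'role': 'system', 'content': 'You are a helpful assistant.'}]
    conversation.append({'role': 'user', 'content': prompt})
    if len(document_section) > 0:
        # assumption: the document chunk rides along as an extra message
        conversation.append({'role': 'assistant', 'content': document_section})
    collected = []
    # stream=True yields chunks whose delta may carry a content piece
    for chunk in openai.ChatCompletion.create(model=model_choice, messages=conversation, stream=True):
        collected.append(chunk['choices'][0]['delta'].get('content', ''))
    return ''.join(collected)
```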
@@ -720,8 +588,6 @@ def main():
         filename = generate_filename(raw, 'txt')
         create_file(filename, raw, '', should_save)
 
-# 18. Run AI Pipeline
 if __name__ == "__main__":
     whisper_main()
     main()
-