cokoli1 committed on
Commit b97de2f · 1 Parent(s): ff3185c

Delete app.py

Files changed (1)
app.py +0 -473
app.py DELETED
@@ -1,473 +0,0 @@
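
# app.py: ChatGPT-Memory-Chat-Story-Generator (awacke1's Space)
# A Streamlit app that chats with OpenAI chat models (with streaming output),
# transcribes audio via the Whisper transcription API, saves each exchange to
# downloadable files, and answers questions over uploaded documents with a
# FAISS-backed ConversationalRetrievalChain.
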
import streamlit as st
import openai
import os
import base64
import glob
import json
import mistune
import pytz
import math
import requests
import time
import re
import textract

from datetime import datetime
from openai import ChatCompletion
from xml.etree import ElementTree as ET
from bs4 import BeautifulSoup
from collections import deque
from audio_recorder_streamlit import audio_recorder

from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from templates import css, bot_template, user_template

def generate_filename(prompt, file_type):
    central = pytz.timezone('US/Central')
    safe_date_time = datetime.now(central).strftime("%m%d_%H%M")  # date and time as MMDD_HHMM
    safe_prompt = "".join(x for x in prompt if x.isalnum())[:90]  # keep only alphanumerics, cap length
    return f"{safe_date_time}_{safe_prompt}.{file_type}"  # return a safe file name

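# Illustrative example (not from the original file): with US/Central time
# 07/15 14:30, generate_filename("What is AI?", "txt") -> "0715_1430_WhatisAI.txt"
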
def transcribe_audio(openai_key, file_path, model):
    OPENAI_API_URL = "https://api.openai.com/v1/audio/transcriptions"
    headers = {
        "Authorization": f"Bearer {openai_key}",
    }
    with open(file_path, 'rb') as f:
        data = {'file': f}
        response = requests.post(OPENAI_API_URL, headers=headers, files=data, data={'model': model})
    if response.status_code == 200:
        st.write(response.json())
        transcript = response.json().get('text')  # parse the response once, then reuse
        chatResponse = chat_with_model(transcript, '')
        filename = generate_filename(transcript, 'txt')
        create_file(filename, transcript, chatResponse)
        return transcript
    else:
        st.write(response.json())
        st.error("Error in API call.")
        return None

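# Illustrative usage sketch (hypothetical filename, assumes OPENAI_API_KEY is set):
# transcript = transcribe_audio(os.getenv('OPENAI_API_KEY'), '0715_1430_Recording.wav', 'whisper-1')
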
def save_and_play_audio(audio_recorder):
    audio_bytes = audio_recorder()
    if audio_bytes:
        filename = generate_filename("Recording", "wav")
        with open(filename, 'wb') as f:
            f.write(audio_bytes)
        st.audio(audio_bytes, format="audio/wav")
        return filename
    return None

def create_file(filename, prompt, response):
    if filename.endswith(".txt"):
        with open(filename, 'w') as file:
            file.write(f"{prompt}\n{response}")
    elif filename.endswith(".htm"):
        with open(filename, 'w') as file:
            file.write(f"{prompt} {response}")
    elif filename.endswith(".md"):
        with open(filename, 'w') as file:
            file.write(f"{prompt}\n\n{response}")

def truncate_document(document, length):
    return document[:length]

def divide_document(document, max_length):
    return [document[i:i+max_length] for i in range(0, len(document), max_length)]

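# Quick sanity check (illustrative): divide_document("abcdef", 4) -> ["abcd", "ef"]
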
def get_table_download_link(file_path):
    with open(file_path, 'r') as file:
        try:
            data = file.read()
        except Exception:
            st.write('')
            return file_path
    b64 = base64.b64encode(data.encode()).decode()
    file_name = os.path.basename(file_path)
    ext = os.path.splitext(file_name)[1]  # get the file extension, including the dot
    if ext in ('.txt', '.py', '.xlsx', '.csv'):
        mime_type = 'text/plain'
    elif ext == '.htm':
        mime_type = 'text/html'
    elif ext == '.md':
        mime_type = 'text/markdown'
    else:
        mime_type = 'application/octet-stream'  # general binary data type
    href = f'<a href="data:{mime_type};base64,{b64}" target="_blank" download="{file_name}">{file_name}</a>'
    return href

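# Design note: the returned <a> tag embeds the file as a base64 data URI, so
# st.markdown can render a working download link without serving the file separately.
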
def CompressXML(xml_text):
    root = ET.fromstring(xml_text)
    # ElementTree elements carry no parent pointers, so build a child -> parent
    # map first; elem.parent does not exist and would raise AttributeError
    parent_map = {child: parent for parent in root.iter() for child in parent}
    for elem in list(root.iter()):
        if isinstance(elem.tag, str) and 'Comment' in elem.tag:
            parent = parent_map.get(elem)
            if parent is not None:
                parent.remove(elem)
    return ET.tostring(root, encoding='unicode', method="xml")

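# Illustrative example: elements whose tag contains 'Comment' are dropped, e.g.
# CompressXML("<root><a/><Comment>hidden</Comment></root>") -> "<root><a /></root>"
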
def read_file_content(file, max_length):
    if file.type == "application/json":
        content = json.load(file)
        return str(content)
    elif file.type == "text/html" or file.type == "text/htm":
        content = BeautifulSoup(file, "html.parser")
        return content.text
    elif file.type == "application/xml" or file.type == "text/xml":
        tree = ET.parse(file)
        root = tree.getroot()
        xml = CompressXML(ET.tostring(root, encoding='unicode'))
        return xml
    elif file.type == "text/markdown" or file.type == "text/md":
        md = mistune.create_markdown()
        content = md(file.read().decode())
        return content
    elif file.type == "text/plain":
        return file.getvalue().decode()
    else:
        return ""

def chat_with_model(prompt, document_section, model_choice='gpt-3.5-turbo'):
    model = model_choice
    conversation = [{'role': 'system', 'content': 'You are a helpful assistant.'}]
    conversation.append({'role': 'user', 'content': prompt})
    if len(document_section) > 0:
        conversation.append({'role': 'assistant', 'content': document_section})

    start_time = time.time()
    report = []
    res_box = st.empty()
    collected_chunks = []
    collected_messages = []

    for chunk in openai.ChatCompletion.create(
            model=model,  # honor the caller's model choice rather than hardcoding 'gpt-3.5-turbo'
            messages=conversation,
            temperature=0.5,
            stream=True):
        collected_chunks.append(chunk)                # save the event response
        chunk_message = chunk['choices'][0]['delta']  # extract the message delta
        collected_messages.append(chunk_message)      # save the message

        content = chunk_message.get('content')
        if content:  # deltas without content (e.g. role-only) are skipped
            report.append(content)
            result = "".join(report).strip()
            res_box.markdown(f'*{result}*')

    full_reply_content = ''.join([m.get('content', '') for m in collected_messages])
    st.write("Elapsed time:")
    st.write(time.time() - start_time)
    return full_reply_content

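# Design note: stream=True lets res_box update incrementally, so long replies
# render as they arrive instead of appearing only after the full completion.
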
def chat_with_file_contents(prompt, file_content, model_choice='gpt-3.5-turbo'):
    conversation = [{'role': 'system', 'content': 'You are a helpful assistant.'}]
    conversation.append({'role': 'user', 'content': prompt})
    if len(file_content) > 0:
        conversation.append({'role': 'assistant', 'content': file_content})
    response = openai.ChatCompletion.create(model=model_choice, messages=conversation)
    return response['choices'][0]['message']['content']

def extract_mime_type(file):
    # Check if the input is a string
    if isinstance(file, str):
        pattern = r"type='(.*?)'"
        match = re.search(pattern, file)
        if match:
            return match.group(1)
        else:
            raise ValueError(f"Unable to extract MIME type from {file}")
    # Otherwise assume a Streamlit UploadedFile-like object; duck-typing on the
    # .type attribute avoids importing the UploadedFile class, whose module path
    # has moved between Streamlit versions (the bare name `streamlit.UploadedFile`
    # in the original would raise a NameError)
    elif hasattr(file, 'type'):
        return file.type
    else:
        raise TypeError("Input should be a string or a Streamlit UploadedFile object")

from io import BytesIO

def extract_file_extension(file):
    # Get the file name directly from the UploadedFile object
    file_name = file.name
    pattern = r".*?\.(.*?)$"
    match = re.search(pattern, file_name)
    if match:
        return match.group(1)
    else:
        raise ValueError(f"Unable to extract file extension from {file_name}")

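# Caveat (illustrative): the non-greedy pattern splits at the first dot, so a
# file named "report.final.pdf" yields "final.pdf", which the 'pdf' check in
# pdf2txt below will not match.
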
def pdf2txt(docs):
    text = ""
    for file in docs:
        file_extension = extract_file_extension(file)
        st.write(f"File type extension: {file_extension}")

        # Read the file according to its extension
        try:
            if file_extension.lower() in ['py', 'txt', 'html', 'htm', 'xml', 'json']:
                text += file.getvalue().decode('utf-8')
            elif file_extension.lower() == 'pdf':
                pdf = PdfReader(BytesIO(file.getvalue()))
                for page in pdf.pages:
                    text += page.extract_text() or ''  # extract_text() can return None for image-only pages
        except Exception as e:
            st.write(f"Error processing file {file.name}: {e}")

    return text

def pdf2txt_old(pdf_docs):
    st.write(pdf_docs)
    for file in pdf_docs:
        mime_type = extract_mime_type(file)
        st.write(f"MIME type of file: {mime_type}")

    text = ""
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

def txt2chunks(text):
    text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)
    return text_splitter.split_text(text)

def vector_store(text_chunks):
    key = os.getenv('OPENAI_API_KEY')
    embeddings = OpenAIEmbeddings(openai_api_key=key)
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)

def get_chain(vectorstore):
    llm = ChatOpenAI()
    memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(llm=llm, retriever=vectorstore.as_retriever(), memory=memory)

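# Retrieval pipeline: raw text -> txt2chunks (1000-char chunks, 200 overlap)
# -> vector_store (FAISS over OpenAI embeddings) -> get_chain
# (ConversationalRetrievalChain with conversation buffer memory).
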
def process_user_input(user_question):
    response = st.session_state.conversation({'question': user_question})
    st.session_state.chat_history = response['chat_history']
    for i, message in enumerate(st.session_state.chat_history):
        template = user_template if i % 2 == 0 else bot_template
        st.write(template.replace("{{MSG}}", message.content), unsafe_allow_html=True)
        # Save file output from PDF query results
        filename = generate_filename(user_question, 'txt')
        create_file(filename, user_question, message.content)

def divide_prompt(prompt, max_length):
    words = prompt.split()
    chunks = []
    current_chunk = []
    current_length = 0
    for word in words:
        if len(word) + current_length <= max_length:
            current_length += len(word) + 1  # +1 accounts for the space between words
            current_chunk.append(word)
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
    chunks.append(' '.join(current_chunk))  # append the final chunk
    return chunks

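# Quick sanity check (illustrative): divide_prompt("one two three", 8) -> ["one two", "three"]
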
def main():
    # Sidebar and global
    openai.api_key = os.getenv('OPENAI_API_KEY')
    st.set_page_config(page_title="GPT Streamlit Document Reasoner", layout="wide")

    # File type for output, model choice
    menu = ["txt", "htm", "xlsx", "csv", "md", "py"]
    choice = st.sidebar.selectbox("Output File Type:", menu)
    model_choice = st.sidebar.radio("Select Model:", ('gpt-3.5-turbo', 'gpt-3.5-turbo-0301'))

    # Audio: record, transcribe, then send the transcript to GPT
    filename = save_and_play_audio(audio_recorder)
    if filename is not None:
        transcription = transcribe_audio(openai.api_key, filename, "whisper-1")
        st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)
        filename = None  # transcription is finished; reuse the saved transcript next time

    # Prompt interface
    user_prompt = st.text_area("Enter prompts, instructions & questions:", '', height=100)

    # File section interface for prompts against large documents as context
    collength, colupload = st.columns([2, 3])  # adjust the ratio as needed
    with collength:
        max_length = st.slider("File section length for large files", min_value=1000, max_value=128000, value=12000, step=1000)
    with colupload:
        uploaded_file = st.file_uploader("Add a file for context:", type=["pdf", "xml", "json", "xlsx", "csv", "html", "htm", "md", "txt"])

    # Document section chat
    document_sections = deque()
    document_responses = {}
    if uploaded_file is not None:
        file_content = read_file_content(uploaded_file, max_length)
        document_sections.extend(divide_document(file_content, max_length))
    if len(document_sections) > 0:
        if st.button("👁️ View Upload"):
            st.markdown("**Sections of the uploaded file:**")
            for i, section in enumerate(list(document_sections)):
                st.markdown(f"**Section {i+1}**\n{section}")
        st.markdown("**Chat with the model:**")
        for i, section in enumerate(list(document_sections)):
            if i in document_responses:
                st.markdown(f"**Section {i+1}**\n{document_responses[i]}")
            else:
                if st.button(f"Chat about Section {i+1}"):
                    st.write('Reasoning with your inputs...')
                    response = chat_with_model(user_prompt, section, model_choice)
                    st.write('Response:')
                    st.write(response)
                    document_responses[i] = response
                    filename = generate_filename(f"{user_prompt}_section_{i+1}", choice)
                    create_file(filename, user_prompt, response)
                    st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)

    if st.button('💬 Chat'):
        st.write('Reasoning with your inputs...')

        # Divide the user prompt into sections that fit the chosen context length
        user_prompt_sections = divide_prompt(user_prompt, max_length)
        full_response = ''
        for prompt_section in user_prompt_sections:
            # Process each section with the model, using the document sections as context
            response = chat_with_model(prompt_section, ''.join(list(document_sections)), model_choice)
            full_response += response + '\n'  # combine the responses

        response = full_response
        st.write('Response:')
        st.write(response)

        filename = generate_filename(user_prompt, choice)
        create_file(filename, user_prompt, response)
        st.sidebar.markdown(get_table_download_link(filename), unsafe_allow_html=True)

    all_files = glob.glob("*.*")
    all_files = [file for file in all_files if len(os.path.splitext(file)[0]) >= 20]  # exclude files with short names
    all_files.sort(key=lambda x: (os.path.splitext(x)[1], x), reverse=True)  # sort by file type, then name, descending

    # Sidebar of files
    file_contents = ''
    next_action = ''
    for file in all_files:
        col1, col2, col3, col4, col5 = st.sidebar.columns([1, 6, 1, 1, 1])  # adjust the ratio as needed
        with col1:
            if st.button("🌐", key="md_" + file):  # render the file as markdown
                with open(file, 'r') as f:
                    file_contents = f.read()
                next_action = 'md'
        with col2:
            st.markdown(get_table_download_link(file), unsafe_allow_html=True)
        with col3:
            if st.button("📂", key="open_" + file):  # open the file contents
                with open(file, 'r') as f:
                    file_contents = f.read()
                next_action = 'open'
        with col4:
            if st.button("🔍", key="read_" + file):  # search the file contents with the model
                with open(file, 'r') as f:
                    file_contents = f.read()
                next_action = 'search'
        with col5:
            if st.button("🗑", key="delete_" + file):
                os.remove(file)
                st.experimental_rerun()

    if len(file_contents) > 0:
        if next_action == 'open':
            file_content_area = st.text_area("File Contents:", file_contents, height=500)
        if next_action == 'md':
            st.markdown(file_contents)
        if next_action == 'search':
            file_content_area = st.text_area("File Contents:", file_contents, height=500)
            st.write('Reasoning with your inputs...')
            response = chat_with_model(user_prompt, file_contents, model_choice)
            filename = generate_filename(file_contents, choice)
            create_file(filename, file_contents, response)

            st.experimental_rerun()

if __name__ == "__main__":
    main()

load_dotenv()
st.write(css, unsafe_allow_html=True)

st.header("Chat with documents :books:")
user_question = st.text_input("Ask a question about your documents:")
if user_question:
    process_user_input(user_question)

with st.sidebar:
    st.subheader("Your documents")
    docs = st.file_uploader("import documents", accept_multiple_files=True)
    with st.spinner("Processing"):
        raw = pdf2txt(docs) if docs else ''  # guard against an empty or missing upload list
        if len(raw) > 0:
            length = str(len(raw))
            text_chunks = txt2chunks(raw)
            vectorstore = vector_store(text_chunks)
            st.session_state.conversation = get_chain(vectorstore)
            st.markdown('# AI Search Index of Length:' + length + ' Created.')  # TODO: add timing
            filename = generate_filename(raw, 'txt')
            create_file(filename, raw, '')
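
# Note: under `streamlit run app.py` the script executes with __name__ set to
# "__main__", so main() runs first and the module-level document-chat UI after
# the guard renders on the same pass (and again on every rerun).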