JPBianchi committed on
Commit
083cd31
1 Parent(s): 0cc727c

updated app

Files changed (5)
  1. app.py +31 -24
  2. app.shell.py +143 -0
  3. app_features.py +179 -0
  4. debug.py +23 -0
  5. prompt_templates_luis.py +0 -63
app.py CHANGED
@@ -316,27 +316,30 @@ def main():
 
         st.write("Experimental and time limited 2'")
         finetune_model = st.toggle('Finetune on Modal A100 GPU', False)
-        if finetune_model:
-            from finetune_backend import finetune
-            if 'finetuned' in model_name_or_path:
-                st.write("Model already finetuned")
-            elif model_name_or_path.startswith("models/"):
-                st.write("Sentence Transformers models only!")
-            else:
-                try:
-                    if 'finetuned' in finetune_model:
-                        st.write("Model already finetuned")
-                    else:
-                        model_path = finetune(model_name_or_path, savemodel=True, outpath='models')
-                        if model_path is not None:
-                            if finetune_model.split('/')[-1] not in model_path:
-                                st.write(model_path)  # a warning from finetuning in this case
-                            elif model_path not in available_models:
-                                # finetuning generated a model, let's add it
-                                available_models.append(model_path)
-                                st.write("Model saved!")
-                except Exception:
-                    st.write("Model not found on HF or error")
+        if we_are_not_online:
+            if finetune_model:
+                from finetune_backend import finetune
+                if 'finetuned' in model_name_or_path:
+                    st.write("Model already finetuned")
+                elif model_name_or_path.startswith("models/"):
+                    st.write("Sentence Transformers models only!")
+                else:
+                    try:
+                        if 'finetuned' in model_name_or_path:
+                            st.write("Model already finetuned")
+                        else:
+                            model_path = finetune(model_name_or_path, savemodel=True, outpath='models')
+                            if model_path is not None:
+                                if model_name_or_path.split('/')[-1] not in model_path:
+                                    st.write(model_path)  # a warning from finetuning in this case
+                                elif model_path not in available_models:
+                                    # finetuning generated a model, let's add it
+                                    available_models.append(model_path)
+                                    st.write("Model saved!")
+                    except Exception:
+                        st.write("Model not found on HF or error")
+        else:
+            st.write("Finetuning not available on Streamlit online because of space limitations")
 
         model_name_or_path = check_model(model_name_or_path)
         client, available_classes = get_weaviate_client(Wapi_key, url, model_name_or_path, openai_api_key)
@@ -404,7 +407,7 @@ def main():
         # best solution I found to be able to change the text inside a text_input box afterwards, using a key
         query = textbox.text_input(msg,
                                    value="",
-                                   placeholder="You can refer to the guest with pronoun or drop the question mark",
+                                   placeholder="You can refer to the guest with PRONOUNS",
                                    key=st.session_state.key)
 
         # st.write(f"Guest = {guest}")
@@ -438,8 +441,12 @@ def main():
             # let's use Llama2 here
             reworded_query = reword_query(query, guest,
                                           model_name='llama2-13b-chat')
-            query = reworded_query['rewritten_question']
-
+            new_query = reworded_query['rewritten_question']
+            if guest.split(' ')[-1] not in new_query and guest.split(' ')[0] not in new_query:
+                # if the guest name is not in the rewritten question, prepend it
+                new_query = f"About {guest}, " + new_query[0].lower() + new_query[1:]
+            query = new_query
+
             # we can arrive here only if a guest was selected
             where_filter = WhereFilter(path=['guest'], operator='Equal', valueText=guest).todict() \
                 if hybrid_filter else None
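
Note on the query-rewriting change above: the guard fires only when neither the guest's first nor last name survives the Llama2 rewrite. A minimal stand-alone sketch of that guard (the helper name ensure_guest_in_query is hypothetical, introduced here for illustration):

    def ensure_guest_in_query(new_query: str, guest: str) -> str:
        # if neither the guest's first nor last name is in the rewritten question, prepend it
        if guest.split(' ')[-1] not in new_query and guest.split(' ')[0] not in new_query:
            new_query = f"About {guest}, " + new_query[0].lower() + new_query[1:]
        return new_query

    # e.g. "What does he think about AI?" -> "About Ian Bremmer, what does he think about AI?"
    print(ensure_guest_in_query("What does he think about AI?", "Ian Bremmer"))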
app.shell.py ADDED
@@ -0,0 +1,143 @@
+from tiktoken import get_encoding
+from weaviate_interface import WeaviateClient
+from prompt_templates import question_answering_prompt_series, question_answering_system
+from openai_interface import GPT_Turbo
+from app_features import (convert_seconds, generate_prompt_series, search_result,
+                          validate_token_threshold, load_content_cache, load_data)
+from reranker import ReRanker
+from loguru import logger
+import streamlit as st
+import sys
+import json
+import os
+
+# load environment variables
+from dotenv import load_dotenv
+load_dotenv('.env', override=True)
+
+## PAGE CONFIGURATION
+st.set_page_config(page_title="Impact Theory",
+                   page_icon=None,
+                   layout="wide",
+                   initial_sidebar_state="auto",
+                   menu_items=None)
+##############
+# START CODE #
+##############
+data_path = 'data/impact_theory_data.json'
+cache_path = 'data/impact_theory_cache.parquet'
+data = load_data(data_path)
+cache = load_content_cache(cache_path)
+## RETRIEVER
+client.display_properties.append('summary')
+## RERANKER
+
+## LLM
+
+## ENCODING
+
+## INDEX NAME
+
+##############
+# END CODE #
+##############
+data = load_data(data_path)
+# creates list of guests for sidebar
+guest_list = sorted(list(set([d['guest'] for d in data])))
+
+def main():
+
+    with st.sidebar:
+        guest = st.selectbox('Select Guest', options=guest_list, index=None, placeholder='Select Guest')
+
+    st.image('./assets/impact-theory-logo.png', width=400)
+    st.subheader(f"Chat with the Impact Theory podcast: ")
+    st.write('\n')
+    col1, _ = st.columns([7, 3])
+    with col1:
+        query = st.text_input('Enter your question: ')
+        st.write('\n\n\n\n\n')
+
+    if query:
+        ##############
+        # START CODE #
+        ##############
+
+        st.write('Hmmm...this app does not seem to be working yet. Please check back later.')
+        if guest:
+            st.write(f'However, it looks like you selected {guest} as a filter.')
+        # make hybrid call to weaviate
+        hybrid_response = None
+        # rerank results
+        ranked_response = None
+        # validate token count is below threshold
+        # valid_response = validate_token_threshold(ranked_response,
+        #                                           question_answering_prompt_series,
+        #                                           query=query,
+        #                                           tokenizer=  # variable from ENCODING,
+        #                                           token_threshold=4000,
+        #                                           verbose=True)
+        ##############
+        # END CODE #
+        ##############
+
+        # # generate LLM prompt
+        # prompt = generate_prompt_series(query=query, results=valid_response)
+
+        # # prep for streaming response
+        # st.subheader("Response from Impact Theory (context)")
+        # with st.spinner('Generating Response...'):
+        #     st.markdown("----")
+        #     # creates container for LLM response
+        #     chat_container, response_box = [], st.empty()
+        #
+        #     # execute chat call to LLM
+        #     ##############
+        #     # START CODE #
+        #     ##############
+        #
+
+        #     ##############
+        #     # END CODE #
+        #     ##############
+        #     try:
+        #         # inserts chat stream from LLM
+        #         with response_box:
+        #             content = resp.choices[0].delta.content
+        #             if content:
+        #                 chat_container.append(content)
+        #                 result = "".join(chat_container).strip()
+        #                 st.write(f'{result}')
+        #     except Exception as e:
+        #         print(e)
+        #         continue
+        # ##############
+        # # START CODE #
+        # ##############
+        # st.subheader("Search Results")
+        # for i, hit in enumerate(valid_response):
+        #     col1, col2 = st.columns([7, 3], gap='large')
+        #     image =  # get thumbnail_url
+        #     episode_url =  # get episode_url
+        #     title =  # get title
+        #     show_length =  # get length
+        #     time_string =  # convert show_length to readable time string
+        #     ##############
+        #     # END CODE #
+        #     ##############
+        #     with col1:
+        #         st.write(search_result(i=i,
+        #                                url=episode_url,
+        #                                guest=hit['guest'],
+        #                                title=title,
+        #                                content=hit['content'],
+        #                                length=time_string),
+        #                  unsafe_allow_html=True)
+        #         st.write('\n\n')
+        #     with col2:
+        #         # st.write(f"<a href={episode_url} <img src={image} width='200'></a>",
+        #         #          unsafe_allow_html=True)
+        #         st.image(image, caption=title.split('|')[0], width=200, use_column_width=False)
+
+if __name__ == '__main__':
+    main()
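
Note on app.shell.py: the file is a teaching shell, and its START CODE / END CODE blocks are meant to be filled in with instances of the classes imported at the top. A minimal sketch, assuming constructor arguments, environment variable names, and the index name (none of which are confirmed by this commit):

    ## RETRIEVER
    client = WeaviateClient(os.environ['WEAVIATE_API_KEY'], os.environ['WEAVIATE_ENDPOINT'])  # args are assumptions
    client.display_properties.append('summary')
    ## RERANKER
    reranker = ReRanker()  # default-construction is an assumption
    ## LLM
    llm = GPT_Turbo()  # default-construction is an assumption
    ## ENCODING
    encoding = get_encoding('cl100k_base')  # tiktoken encoding used for token counting
    ## INDEX NAME
    class_name = 'Impact_theory_minilm_256'  # hypothetical Weaviate class name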
app_features.py ADDED
@@ -0,0 +1,179 @@
+import time
+import json
+from preprocessing import FileIO
+from typing import List, Optional
+import tiktoken
+from loguru import logger
+from prompt_templates import context_block, question_answering_prompt_series
+import streamlit as st
+
+@st.cache_data
+def load_content_cache(data_path: str):
+    data = FileIO().load_parquet(data_path)
+    content_data = {d['doc_id']: d['content'] for d in data}
+    return content_data
+
+@st.cache_data
+def load_data(data_path: str):
+    with open(data_path, 'r') as f:
+        data = json.load(f)
+    return data
+
+def convert_seconds(seconds: int):
+    """
+    Converts seconds to a string of format Hours:Minutes:Seconds
+    """
+    return time.strftime("%H:%M:%S", time.gmtime(seconds))
+
+def expand_content(ranked_results: List[dict],
+                   content_cache: Optional[dict] = None,
+                   content_key: str = 'doc_id',
+                   create_new_list: bool = False
+                   ) -> List[dict]:
+    '''
+    Updates or creates a list of ranked results with content from a cache.
+
+    This function iterates over a list of dictionaries representing ranked results.
+    If a cache is provided, it adds or updates the 'content' key in each dictionary
+    with the corresponding content from the cache based on the content_key.
+
+    Args:
+    - ranked_results (List[dict]): A list of dictionaries, each representing a ranked result.
+    - content_cache (Optional[dict]): A dictionary that maps content_key to content.
+      If None, the content of ranked results will not be updated.
+    - content_key (str): The key used in both the ranked results and content cache to match
+      the ranked results with their corresponding content in the cache.
+    - create_new_list (bool): If True, a new list of dictionaries will be created and
+      returned with the content updated. If False, the ranked_results will be updated in place.
+
+    Returns:
+    - List[dict]: A new list with updated content if create_new_list is True; otherwise,
+      the original ranked_results list with updated content.
+
+    Note:
+    - If create_new_list is False, the function will mutate the original ranked_results list.
+    - The function only updates content if the content_key exists in both the ranked result
+      and the content cache.
+
+    Example:
+    ```
+    ranked_results = [{'doc_id': '123', 'title': 'Title 1'}, {'doc_id': '456', 'title': 'Title 2'}]
+    content_cache = {'123': 'Content for 123', '456': 'Content for 456'}
+    updated_results = expand_content(ranked_results, content_cache, create_new_list=True)
+    # updated_results is now [{'doc_id': '123', 'title': 'Title 1', 'content': 'Content for 123'},
+    #                         {'doc_id': '456', 'title': 'Title 2', 'content': 'Content for 456'}]
+    ```
+    '''
+    if create_new_list:
+        expanded_response = [{k: v for k, v in resp.items()} for resp in ranked_results]
+        if content_cache is not None:
+            for resp in expanded_response:
+                if resp[content_key] in content_cache:
+                    resp['content'] = content_cache[resp[content_key]]
+        return expanded_response
+    else:
+        for resp in ranked_results:
+            if content_cache and resp[content_key] in content_cache:
+                resp['content'] = content_cache[resp[content_key]]
+        return ranked_results
+
+def generate_prompt_series(query: str, results: List[dict]) -> str:
+    """
+    Generates a prompt for the OpenAI API by joining the context blocks of the top results.
+    Provides context to the LLM by supplying the summary, guest, and retrieved content of each result.
+    Newlines and indentation in the template keep the blocks easily readable as well-separated paragraphs.
+
+    Args:
+    -----
+    query : str
+        User query
+    results : List[dict]
+        List of results from the Weaviate client
+    """
+    context_series = '\n'.join([context_block.format(summary=res['summary'],
+                                                     guest=res['guest'],
+                                                     transcript=res['content'])
+                                for res in results]).strip()
+    prompt = question_answering_prompt_series.format(question=query, series=context_series)
+    return prompt
+
+def validate_token_threshold(ranked_results: List[dict],
+                             base_prompt: str,
+                             query: str,
+                             tokenizer: tiktoken.Encoding,
+                             token_threshold: int,
+                             verbose: bool = False
+                             ) -> List[dict]:
+    """
+    Validates that the prompt is below the set token threshold by adding the lengths of:
+    1. Base prompt
+    2. User query
+    3. Context material
+    If the threshold is exceeded, context results are removed incrementally until the
+    combined prompt tokens are below the threshold. This function does not take into
+    account every token passed to the LLM, but it is a good approximation.
+    """
+    overhead_len = len(tokenizer.encode(base_prompt.format(question=query, series='')))
+    context_len = _get_batch_length(ranked_results, tokenizer)
+
+    token_count = overhead_len + context_len
+    if token_count > token_threshold:
+        print('Token count exceeds token count threshold, reducing size of returned results below token threshold')
+
+        while token_count > token_threshold and len(ranked_results) > 1:
+            num_results = len(ranked_results)
+            # remove the last ranked (least relevant) result
+            ranked_results = ranked_results[:num_results - 1]
+            # recalculate new token_count
+            token_count = overhead_len + _get_batch_length(ranked_results, tokenizer)
+
+    if verbose:
+        logger.info(f'Total Final Token Count: {token_count}')
+    return ranked_results
+
+def _get_batch_length(ranked_results: List[dict], tokenizer: tiktoken.Encoding) -> int:
+    '''
+    Convenience function to get the length in tokens of a batch of results
+    '''
+    contexts = tokenizer.encode_batch([r['content'] for r in ranked_results])
+    context_len = sum(list(map(len, contexts)))
+    return context_len
+
+def search_result(i: int,
+                  url: str,
+                  title: str,
+                  content: str,
+                  guest: str,
+                  length: str,
+                  space: str = '&nbsp; &nbsp;'
+                  ) -> str:
+    '''
+    HTML to display search results.
+
+    Args:
+    -----
+    i: int
+        index of search result
+    url: str
+        url of YouTube video
+    title: str
+        title of episode
+    content: str
+        content chunk of episode
+    guest: str
+        name of the episode guest
+    length: str
+        episode length as a readable time string
+    '''
+    return f"""
+        <div style="font-size:120%;">
+            {i + 1}.<a href="{url}">{title}</a>
+        </div>
+
+        <div style="font-size:95%;">
+            <p>Episode Length: {length} {space}{space} Guest: {guest}</p>
+            <div style="color:grey;float:left;">
+                ...
+            </div>
+            {content}
+        </div>
+        """
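
Note on app_features.py: the helpers compose into the prompt-assembly path used by the app: expand the reranked hits with cached content, trim them to the token budget, then format the series prompt. A minimal usage sketch (the result dict below is fabricated for illustration; real hits come from Weaviate and the reranker):

    import tiktoken
    ranked = [{'doc_id': '123', 'guest': 'Ian Bremmer',
               'summary': 'AI and geopolitics',
               'content': 'You said these are dangerous times...'}]
    encoding = tiktoken.get_encoding('cl100k_base')
    valid = validate_token_threshold(ranked, question_answering_prompt_series,
                                     query='What does Ian say about AI?',
                                     tokenizer=encoding, token_threshold=4000, verbose=True)
    prompt = generate_prompt_series(query='What does Ian say about AI?', results=valid)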
debug.py ADDED
@@ -0,0 +1,23 @@
+
+import tiktoken
+from llama_index.text_splitter import SentenceSplitter
+
+d = {'title': "THE BIG AI RESET: The Next Global SuperPower Isn't Who You Think | Ian Bremmer",
+     'video_id': 'nXJBccSwtB8',
+     'playlist_id': 'PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo',
+     'length': 5410,
+     'thumbnail_url': 'https://i.ytimg.com/vi/nXJBccSwtB8/hq720.jpg',
+     'views': 138628,
+     'episode_url': 'https://www.youtube.com/watch?v=nXJBccSwtB8&list=PL8qcvQ7Byc3OJ02hbWJbHWePh4XEg3cvo',
+     'guest': 'Ian Bremmer',
+     'summary': "In this episode, Ian Bremmer discusses the rise of big tech as a third superpower and the potential dangers and opportunities it presents. He highlights the immense power held by tech companies in shaping society, the economy, and national security, emphasizing their sovereignty over the digital world. Bremmer expresses concerns about the growing influence of AI and its potential to outstrip government regulation, leading to a reality where tech companies wield significant power over individuals. He also delves into the risks associated with AI proliferation, including the potential for non-governments to control and misuse the technology, exacerbating social inequalities and disinformation. Bremmer emphasizes the need to address negative externalities and regulate AI to mitigate its adverse impacts. Additionally, he discusses the implications of AI on job displacement and social discontent, particularly for marginalized communities. The conversation delves into the breakdown of truth in the digital age, driven by algorithmic sorting and micro-targeting, leading to fragmented echo chambers and the erosion of consensus on facts. Both Bremmer and the host explore the challenges of navigating truth in a polarized and algorithmically driven information landscape, highlighting the need for critical thinking and a focus on human flourishing as a guiding principle in the face of AI's transformative impact.",
+     'content': "You said these are dangerous times. The world order is shifting before our eyes"}
+
+chunk_size = 256
+chunk_overlap = 0
+encoding = tiktoken.encoding_for_model('gpt-3.5-turbo-0613')
+gpt35_txt_splitter = SentenceSplitter(chunk_size=chunk_size, tokenizer=encoding.encode, chunk_overlap=chunk_overlap)
+
+gpt35_txt_splitter(d['content'])
+
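
Note on debug.py: it is a scratch script for checking how a transcript chunks at 256 tokens with no overlap. Depending on the llama_index version, the splitter may need to be invoked as split_text rather than called directly; a sketch of inspecting the chunks under that assumption:

    chunks = gpt35_txt_splitter.split_text(d['content'])
    for i, chunk in enumerate(chunks):
        # each chunk should encode to at most 256 tokens
        print(i, len(encoding.encode(chunk)))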
prompt_templates_luis.py DELETED
@@ -1,63 +0,0 @@
-question_answering_system = '''
-You are the host of the show Impact Theory, and your name is Tom Bilyeu. The description of your show is as follows:
-If you’re looking to thrive in uncertain times, achieve unprecedented goals, and improve the most meaningful aspects of your life, then Impact Theory is the show for you. Hosted by Tom Bilyeu, a voracious learner and hyper-successful entrepreneur, the show investigates and analyzes the most useful topics with the world’s most sought-after guests.
-Bilyeu attacks each episode with a clear desire to further evolve the holistic skillset that allowed him to co-found the billion-dollar company Quest Nutrition, generate over half a billion organic views on his content, build a thriving marriage of over 20 years, and quantifiably improve the lives of over 10,000 people through his school, Impact Theory University.
-Bilyeu’s insatiable hunger for knowledge gives the show urgency, relevance, and depth while leaving listeners with the knowledge, tools, and empowerment to take control of their lives and develop true personal power.
-'''
-
-question_answering_prompt_single = '''
-Use the below context enclosed in triple back ticks to answer the question. If the context does not provide enough information to answer the question, then use any knowledge you have to answer the question.\n
-```{context}```\n
-Question:\n
-{question}.\n
-Answer:
-'''
-
-question_answering_prompt_series = '''
-Your task is to synthesize and reason over a series of transcripts of an interview between Tom Bilyeu and his guest(s).
-After your synthesis, use the series of transcripts to answer the below question. The series will be in the following format:\n
-```
-Show Summary: <summary>
-Show Guest: <guest>
-Transcript: <transcript>
-```\n\n
-Start Series:
-```
-{series}
-```
-Question:\n
-{question}\n
-Answer the question and provide reasoning if necessary to explain the answer.\n
-If the context does not provide enough information to answer the question, then \n
-state that you cannot answer the question with the provided context.\n
-
-Answer:
-'''
-
-context_block = '''
-Show Summary: {summary}
-Show Guest: {guest}
-Transcript: {transcript}
-'''
-
-qa_generation_prompt = '''
-Impact Theory episode summary and episode guest are below:
-
----------------------
-Summary: {summary}
----------------------
-Guest: {guest}
----------------------
-Given the Summary and Guest of the episode as context \
-use the following randomly selected transcript section \
-of the episode and not prior knowledge, generate questions that can \
-be answered by the transcript section:
-
----------------------
-Transcript: {transcript}
----------------------
-
-Your task is to create {num_questions_per_chunk} questions that can \
-only be answered given the previous context and transcript details. \
-The question should randomly start with How, Why, or What.
-'''
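
Note on the deletion: app.shell.py and app_features.py import these templates from prompt_templates, so this copy was presumably redundant. Assuming the surviving prompt_templates.py keeps the same placeholders, the templates compose like this (example values are fabricated for illustration):

    block = context_block.format(summary='AI and geopolitics',
                                 guest='Ian Bremmer',
                                 transcript='You said these are dangerous times...')
    prompt = question_answering_prompt_series.format(question='What does Ian say about AI?',
                                                     series=block)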