bstraehle committed
Commit
2db1016
1 Parent(s): 6724508

Update app.py

Files changed (1)
  app.py +24 -19
app.py CHANGED
@@ -30,9 +30,13 @@ RAG_CHAIN_PROMPT = PromptTemplate(input_variables = ["context", "question"],
 CHROMA_DIR = "/data/chroma"
 YOUTUBE_DIR = "/data/youtube"
 
-YOUTUBE_URL_01 = "https://www.youtube.com/watch?v=Iy1IpvcJH7I&list=PL2yQDdvlhXf9XsB2W76_seM6dJxcE2Pdc&index=1"
-YOUTUBE_URL_02 = "https://www.youtube.com/watch?v=Iy1IpvcJH7I&list=PL2yQDdvlhXf9XsB2W76_seM6dJxcE2Pdc&index=2"
-YOUTUBE_URL_03 = "https://www.youtube.com/watch?v=Iy1IpvcJH7I&list=PL2yQDdvlhXf9XsB2W76_seM6dJxcE2Pdc&index=3"
+YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
+YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
+YOUTUBE_URL_3 = "https://www.youtube.com/watch?v=vw-KWfKwvTQ"
+
+YOUTUBE_URL_4 = "https://www.youtube.com/shorts/3x95mw35dJY"
+YOUTUBE_URL_5 = "https://www.youtube.com/shorts/zg-DS23wq0c"
+YOUTUBE_URL_6 = "https://www.youtube.com/shorts/cS4fyhKZ8bQ"
 
 MODEL_NAME = "gpt-4"
 
@@ -41,18 +45,21 @@ def invoke(openai_api_key, use_rag, prompt):
                      openai_api_key = openai_api_key,
                      temperature = 0)
     if (use_rag):
-        # Document loading, splitting, and storage
-        #loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_01,
-        #                                           YOUTUBE_URL_02,
-        #                                           YOUTUBE_URL_03], YOUTUBE_DIR),
-        #                       OpenAIWhisperParser())
-        #docs = loader.load()
-        #text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150,
-        #                                               chunk_size = 1500)
-        #splits = text_splitter.split_documents(docs)
-        #vector_db = Chroma.from_documents(documents = splits,
-        #                                  embedding = OpenAIEmbeddings(),
-        #                                  persist_directory = CHROMA_DIR)
+        # Document loading, splitting, and storage
+        loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,
+                                                   YOUTUBE_URL_2,
+                                                   YOUTUBE_URL_3,
+                                                   YOUTUBE_URL_4,
+                                                   YOUTUBE_URL_5,
+                                                   YOUTUBE_URL_6], YOUTUBE_DIR),
+                               OpenAIWhisperParser())
+        docs = loader.load()
+        text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150,
+                                                       chunk_size = 1500)
+        splits = text_splitter.split_documents(docs)
+        vector_db = Chroma.from_documents(documents = splits,
+                                          embedding = OpenAIEmbeddings(),
+                                          persist_directory = CHROMA_DIR)
         # Document retrieval
         vector_db = Chroma(embedding_function = OpenAIEmbeddings(),
                            persist_directory = CHROMA_DIR)
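
The hunk above ends just after the retriever is rebuilt, so the rest of invoke() — the part that actually answers the prompt — is outside this diff. A minimal sketch of that step, assuming the legacy LangChain RetrievalQA API and the RAG_CHAIN_PROMPT defined at the top of app.py; the k value and the exact chain wiring are assumptions, not lines from this commit:

from langchain.chains import RetrievalQA

# llm, vector_db, RAG_CHAIN_PROMPT, and prompt are the objects visible
# (or partially visible) in the hunks above; k = 3 is an assumed value.
rag_chain = RetrievalQA.from_chain_type(llm,
                                        retriever = vector_db.as_retriever(search_kwargs = {"k": 3}),
                                        chain_type_kwargs = {"prompt": RAG_CHAIN_PROMPT})
completion = rag_chain({"query": prompt})
result = completion["result"]
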
@@ -68,10 +75,8 @@ def invoke(openai_api_key, use_rag, prompt):
     return result
 
 description = """<strong>Overview:</strong> The app demonstrates how to use a Large Language Model (LLM) with Retrieval Augmented Generation (RAG) on external data
-                 (in this case the <a href='https://www.youtube.com/playlist?list=PL2yQDdvlhXf9XsB2W76_seM6dJxcE2Pdc'>AWS re:Invent 2022 - AI/ML YouTube playlist</a>,
-                 but it could be PDFs, URLs, or other <a href='https://raw.githubusercontent.com/bstraehle/ai-ml-dl/c38b224c196fc984aab6b6cc6bdc666f8f4fbcff/langchain/document-loaders.png'>data sources</a>).\n\n
-                 <strong>Instructions:</strong> Enter an OpenAI API key and perform LLM use cases (semantic search, sentiment analysis, summarization, translation, etc.) on
-                 a <a href='https://www.youtube.com/watch?v=--khbXchTeE'>short video of GPT-4</a>.
+                 (in this case YouTube videos about GPT-4, but it could be PDFs, URLs, or other <a href='https://raw.githubusercontent.com/bstraehle/ai-ml-dl/c38b224c196fc984aab6b6cc6bdc666f8f4fbcff/langchain/document-loaders.png'>data sources</a>).\n\n
+                 <strong>Instructions:</strong> Enter an OpenAI API key and perform LLM use cases (semantic search, sentiment analysis, summarization, translation, etc.) on the videos.
                 <ul style="list-style-type:square;">
                 <li>Set "Retrieval Augmented Generation" to "<strong>False</strong>" and submit prompt "explain gpt-4". The LLM <strong>without</strong> RAG does not know the answer.</li>
                 <li>Set "Retrieval Augmented Generation" to "<strong>True</strong>" and submit prompt "explain gpt-4". The LLM <strong>with</strong> RAG knows the answer.</li>
 