Spaces:

bstraehle
/

rag

Running

App Files Files Community

bstraehle committed on Oct 22, 2023

Commit

2db1016

1 Parent(s): 6724508

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -19

app.py CHANGED Viewed

@@ -30,9 +30,13 @@ RAG_CHAIN_PROMPT = PromptTemplate(input_variables = ["context", "question"],
 CHROMA_DIR  = "/data/chroma"
 YOUTUBE_DIR = "/data/youtube"
-YOUTUBE_URL_01 = "https://www.youtube.com/watch?v=Iy1IpvcJH7I&list=PL2yQDdvlhXf9XsB2W76_seM6dJxcE2Pdc&index=1"
-YOUTUBE_URL_02 = "https://www.youtube.com/watch?v=Iy1IpvcJH7I&list=PL2yQDdvlhXf9XsB2W76_seM6dJxcE2Pdc&index=2"
-YOUTUBE_URL_03 = "https://www.youtube.com/watch?v=Iy1IpvcJH7I&list=PL2yQDdvlhXf9XsB2W76_seM6dJxcE2Pdc&index=3"
 MODEL_NAME  = "gpt-4"
@@ -41,18 +45,21 @@ def invoke(openai_api_key, use_rag, prompt):
                      openai_api_key = openai_api_key,
                      temperature = 0)
     if (use_rag):
-        # Document loading, splitting, and storage
-        #loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_01,
-        #                                           YOUTUBE_URL_02,
-        #                                           YOUTUBE_URL_03], YOUTUBE_DIR),
-        #                       OpenAIWhisperParser())
-        #docs = loader.load()
-        #text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150,
-        #                                               chunk_size = 1500)
-        #splits = text_splitter.split_documents(docs)
-        #vector_db = Chroma.from_documents(documents = splits,
-        #                                  embedding = OpenAIEmbeddings(),
-        #                                  persist_directory = CHROMA_DIR)
         # Document retrieval
         vector_db = Chroma(embedding_function = OpenAIEmbeddings(),
                            persist_directory = CHROMA_DIR)
@@ -68,10 +75,8 @@ def invoke(openai_api_key, use_rag, prompt):
     return result
 description = """<strong>Overview:</strong> The app demonstrates how to use a Large Language Model (LLM) with Retrieval Augmented Generation (RAG) on external data
-                 (in this case the <a href='https://www.youtube.com/playlist?list=PL2yQDdvlhXf9XsB2W76_seM6dJxcE2Pdc'>AWS re:Invent 2022 - AI/ML YouTube playlist</a>,
-                 but it could be PDFs, URLs, or other <a href='https://raw.githubusercontent.com/bstraehle/ai-ml-dl/c38b224c196fc984aab6b6cc6bdc666f8f4fbcff/langchain/document-loaders.png'>data sources</a>).\n\n
-                 <strong>Instructions:</strong> Enter an OpenAI API key and perform LLM use cases (semantic search, sentiment analysis, summarization, translation, etc.) on
-                 a <a href='https://www.youtube.com/watch?v=--khbXchTeE'>short video of GPT-4</a>.
                  <ul style="list-style-type:square;">
                  <li>Set "Retrieval Augmented Generation" to "<strong>False</strong>" and submit prompt "explain gpt-4". The LLM <strong>without</strong> RAG does not know the answer.</li>
                  <li>Set "Retrieval Augmented Generation" to "<strong>True</strong>" and submit prompt "explain gpt-4". The LLM <strong>with</strong> RAG knows the answer.</li>

 CHROMA_DIR  = "/data/chroma"
 YOUTUBE_DIR = "/data/youtube"
+YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
+YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
+YOUTUBE_URL_3 = "https://www.youtube.com/watch?v=vw-KWfKwvTQ"
+YOUTUBE_URL_4 = "https://www.youtube.com/shorts/3x95mw35dJY"
+YOUTUBE_URL_5 = "https://www.youtube.com/shorts/zg-DS23wq0c"
+YOUTUBE_URL_6 = "https://www.youtube.com/shorts/cS4fyhKZ8bQ"
 MODEL_NAME  = "gpt-4"
                      openai_api_key = openai_api_key,
                      temperature = 0)
     if (use_rag):
+        # Document loading, splitting, and storage
+        loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_01,
+                                                   YOUTUBE_URL_02,
+                                                   YOUTUBE_URL_03,
+                                                   YOUTUBE_URL_04,
+                                                   YOUTUBE_URL_05,
+                                                   YOUTUBE_URL_06], YOUTUBE_DIR),
+                               OpenAIWhisperParser())
+        docs = loader.load()
+        text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150,
+                                                       chunk_size = 1500)
+        splits = text_splitter.split_documents(docs)
+        vector_db = Chroma.from_documents(documents = splits,
+                                          embedding = OpenAIEmbeddings(),
+                                          persist_directory = CHROMA_DIR)
         # Document retrieval
         vector_db = Chroma(embedding_function = OpenAIEmbeddings(),
                            persist_directory = CHROMA_DIR)
     return result
 description = """<strong>Overview:</strong> The app demonstrates how to use a Large Language Model (LLM) with Retrieval Augmented Generation (RAG) on external data
+                 (in this case YouTube videos about GPT-4, but it could be PDFs, URLs, or other <a href='https://raw.githubusercontent.com/bstraehle/ai-ml-dl/c38b224c196fc984aab6b6cc6bdc666f8f4fbcff/langchain/document-loaders.png'>data sources</a>).\n\n
+                 <strong>Instructions:</strong> Enter an OpenAI API key and perform LLM use cases (semantic search, sentiment analysis, summarization, translation, etc.)
                  <ul style="list-style-type:square;">
                  <li>Set "Retrieval Augmented Generation" to "<strong>False</strong>" and submit prompt "explain gpt-4". The LLM <strong>without</strong> RAG does not know the answer.</li>
                  <li>Set "Retrieval Augmented Generation" to "<strong>True</strong>" and submit prompt "explain gpt-4". The LLM <strong>with</strong> RAG knows the answer.</li>