bstraehle committed on
Commit
55274da
1 Parent(s): b61c590

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -19
app.py CHANGED
@@ -3,10 +3,11 @@ import openai, os
3
 
4
  from langchain.chains import LLMChain, RetrievalQA
5
  from langchain.chat_models import ChatOpenAI
6
- from langchain.document_loaders import PyPDFLoader
7
  from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
8
  from langchain.document_loaders.generic import GenericLoader
9
  from langchain.document_loaders.parsers import OpenAIWhisperParser
 
10
  from langchain.embeddings.openai import OpenAIEmbeddings
11
  from langchain.prompts import PromptTemplate
12
  from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -31,15 +32,11 @@ RAG_CHAIN_PROMPT = PromptTemplate(input_variables = ["context", "question"],
31
  CHROMA_DIR = "/data/chroma"
32
  YOUTUBE_DIR = "/data/youtube"
33
 
 
 
34
  YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
35
  YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
36
  YOUTUBE_URL_3 = "https://www.youtube.com/watch?v=vw-KWfKwvTQ"
37
- YOUTUBE_URL_4 = "https://www.youtube.com/watch?v=kiHpqXNCPj8"
38
- YOUTUBE_URL_5 = "https://www.youtube.com/shorts/3x95mw35dJY"
39
- YOUTUBE_URL_6 = "https://www.youtube.com/shorts/zg-DS23wq0c"
40
- YOUTUBE_URL_7 = "https://www.youtube.com/shorts/cS4fyhKZ8bQ"
41
-
42
- PDF_URL = "https://arxiv.org/pdf/2303.08774.pdf"
43
 
44
  MODEL_NAME = "gpt-4"
45
 
@@ -48,24 +45,25 @@ def invoke(openai_api_key, use_rag, prompt):
48
  openai_api_key = openai_api_key,
49
  temperature = 0)
50
  if (use_rag):
51
- # Document loading, splitting, and storage
52
  docs = []
53
- #loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,
54
- # YOUTUBE_URL_2,
55
- # YOUTUBE_URL_3,
56
- # YOUTUBE_URL_4,
57
- # YOUTUBE_URL_5,
58
- # YOUTUBE_URL_6,
59
- # YOUTUBE_URL_7], YOUTUBE_DIR),
60
- # OpenAIWhisperParser())
61
- #docs = loader.load()
62
- ###docs.extend(loader.load())
63
  loader = PyPDFLoader(PDF_URL)
64
  docs.extend(loader.load())
65
- #
 
 
 
 
 
 
 
 
 
66
  text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150,
67
  chunk_size = 1500)
68
  splits = text_splitter.split_documents(docs)
 
69
  vector_db = Chroma.from_documents(documents = splits,
70
  embedding = OpenAIEmbeddings(disallowed_special = ()),
71
  persist_directory = CHROMA_DIR)
 
3
 
4
  from langchain.chains import LLMChain, RetrievalQA
5
  from langchain.chat_models import ChatOpenAI
6
+ from langchain.document_loaders import PyPDFLoader, WebBaseLoader
7
  from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
8
  from langchain.document_loaders.generic import GenericLoader
9
  from langchain.document_loaders.parsers import OpenAIWhisperParser
10
+
11
  from langchain.embeddings.openai import OpenAIEmbeddings
12
  from langchain.prompts import PromptTemplate
13
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
32
  CHROMA_DIR = "/data/chroma"
33
  YOUTUBE_DIR = "/data/youtube"
34
 
35
+ PDF_URL = "https://arxiv.org/pdf/2303.08774.pdf"
36
+ WEB_URL = "https://openai.com/research/gpt-4"
37
  YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
38
  YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
39
  YOUTUBE_URL_3 = "https://www.youtube.com/watch?v=vw-KWfKwvTQ"
 
 
 
 
 
 
40
 
41
  MODEL_NAME = "gpt-4"
42
 
 
45
  openai_api_key = openai_api_key,
46
  temperature = 0)
47
  if (use_rag):
48
+ # Document loading
49
  docs = []
50
+ # Load PDF
 
 
 
 
 
 
 
 
 
51
  loader = PyPDFLoader(PDF_URL)
52
  docs.extend(loader.load())
53
+ # Load Web
54
+ loader = WebBaseLoader(WEB_URL)
55
+ docs.extend(loader.load())
56
+ # Load YouTube
57
+ loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,
58
+ YOUTUBE_URL_2,
59
+ YOUTUBE_URL_3], YOUTUBE_DIR),
60
+ OpenAIWhisperParser())
61
+ docs.extend(loader.load())
62
+ # Document splitting
63
  text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150,
64
  chunk_size = 1500)
65
  splits = text_splitter.split_documents(docs)
66
+ # Document storage
67
  vector_db = Chroma.from_documents(documents = splits,
68
  embedding = OpenAIEmbeddings(disallowed_special = ()),
69
  persist_directory = CHROMA_DIR)