bstraehle committed on
Commit
994b8cd
1 Parent(s): 4f4bc85

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -6
app.py CHANGED
@@ -3,6 +3,7 @@ import openai, os
3
 
4
  from langchain.chains import LLMChain, RetrievalQA
5
  from langchain.chat_models import ChatOpenAI
 
6
  from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
7
  from langchain.document_loaders.generic import GenericLoader
8
  from langchain.document_loaders.parsers import OpenAIWhisperParser
@@ -38,6 +39,8 @@ YOUTUBE_URL_5 = "https://www.youtube.com/shorts/3x95mw35dJY"
38
  YOUTUBE_URL_6 = "https://www.youtube.com/shorts/zg-DS23wq0c"
39
  YOUTUBE_URL_7 = "https://www.youtube.com/shorts/cS4fyhKZ8bQ"
40
 
 
 
41
  MODEL_NAME = "gpt-4"
42
 
43
  def invoke(openai_api_key, use_rag, prompt):
@@ -46,6 +49,7 @@ def invoke(openai_api_key, use_rag, prompt):
46
  temperature = 0)
47
  if (use_rag):
48
  # Document loading, splitting, and storage
 
49
  #loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,
50
  # YOUTUBE_URL_2,
51
  # YOUTUBE_URL_3,
@@ -55,12 +59,16 @@ def invoke(openai_api_key, use_rag, prompt):
55
  # YOUTUBE_URL_7], YOUTUBE_DIR),
56
  # OpenAIWhisperParser())
57
  #docs = loader.load()
58
- #text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150,
59
- # chunk_size = 1500)
60
- #splits = text_splitter.split_documents(docs)
61
- #vector_db = Chroma.from_documents(documents = splits,
62
- # embedding = OpenAIEmbeddings(),
63
- # persist_directory = CHROMA_DIR)
 
 
 
 
64
  # Document retrieval
65
  vector_db = Chroma(embedding_function = OpenAIEmbeddings(),
66
  persist_directory = CHROMA_DIR)
 
3
 
4
  from langchain.chains import LLMChain, RetrievalQA
5
  from langchain.chat_models import ChatOpenAI
6
+ from langchain.document_loaders import PyPDFLoader
7
  from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
8
  from langchain.document_loaders.generic import GenericLoader
9
  from langchain.document_loaders.parsers import OpenAIWhisperParser
 
39
  YOUTUBE_URL_6 = "https://www.youtube.com/shorts/zg-DS23wq0c"
40
  YOUTUBE_URL_7 = "https://www.youtube.com/shorts/cS4fyhKZ8bQ"
41
 
42
+ PDF_URL = "https://arxiv.org/pdf/2303.08774.pdf"
43
+
44
  MODEL_NAME = "gpt-4"
45
 
46
  def invoke(openai_api_key, use_rag, prompt):
 
49
  temperature = 0)
50
  if (use_rag):
51
  # Document loading, splitting, and storage
52
+ docs = []
53
  #loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,
54
  # YOUTUBE_URL_2,
55
  # YOUTUBE_URL_3,
 
59
  # YOUTUBE_URL_7], YOUTUBE_DIR),
60
  # OpenAIWhisperParser())
61
  #docs = loader.load()
62
+ ###docs.extend(loader.load())
63
+ loader = PyPDFLoader(PDF_URL)
64
+ docs.extend(loader.load())
65
+ #
66
+ text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150,
67
+ chunk_size = 1500)
68
+ splits = text_splitter.split_documents(docs)
69
+ vector_db = Chroma.from_documents(documents = splits,
70
+ embedding = OpenAIEmbeddings(),
71
+ persist_directory = CHROMA_DIR)
72
  # Document retrieval
73
  vector_db = Chroma(embedding_function = OpenAIEmbeddings(),
74
  persist_directory = CHROMA_DIR)