Update app.py
app.py CHANGED
@@ -3,10 +3,11 @@ import openai, os
 
 from langchain.chains import LLMChain, RetrievalQA
 from langchain.chat_models import ChatOpenAI
-from langchain.document_loaders import PyPDFLoader
+from langchain.document_loaders import PyPDFLoader, WebBaseLoader
 from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
 from langchain.document_loaders.generic import GenericLoader
 from langchain.document_loaders.parsers import OpenAIWhisperParser
+
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.prompts import PromptTemplate
 from langchain.text_splitter import RecursiveCharacterTextSplitter
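A note on the import change above: WebBaseLoader is what lets the app ingest a plain web page alongside the PDF and YouTube sources. A minimal standalone sketch of how it behaves, assuming the langchain 0.0.x API this file already uses (it needs beautifulsoup4 installed; the variable names are illustrative):

    from langchain.document_loaders import WebBaseLoader

    web_loader = WebBaseLoader("https://openai.com/research/gpt-4")
    web_docs = web_loader.load()              # a list of Document objects
    print(web_docs[0].page_content[:200])     # page text with HTML markup stripped
    print(web_docs[0].metadata)               # includes the source URL and page title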
@@ -31,15 +32,11 @@ RAG_CHAIN_PROMPT = PromptTemplate(input_variables = ["context", "question"],
 CHROMA_DIR = "/data/chroma"
 YOUTUBE_DIR = "/data/youtube"
 
+PDF_URL = "https://arxiv.org/pdf/2303.08774.pdf"
+WEB_URL = "https://openai.com/research/gpt-4"
 YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"
 YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"
 YOUTUBE_URL_3 = "https://www.youtube.com/watch?v=vw-KWfKwvTQ"
-YOUTUBE_URL_4 = "https://www.youtube.com/watch?v=kiHpqXNCPj8"
-YOUTUBE_URL_5 = "https://www.youtube.com/shorts/3x95mw35dJY"
-YOUTUBE_URL_6 = "https://www.youtube.com/shorts/zg-DS23wq0c"
-YOUTUBE_URL_7 = "https://www.youtube.com/shorts/cS4fyhKZ8bQ"
-
-PDF_URL = "https://arxiv.org/pdf/2303.08774.pdf"
 
 MODEL_NAME = "gpt-4"
 
@@ -48,24 +45,25 @@ def invoke(openai_api_key, use_rag, prompt):
                      openai_api_key = openai_api_key,
                      temperature = 0)
     if (use_rag):
-        # Document loading
+        # Document loading
         docs = []
-        #loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,
-        #                                           YOUTUBE_URL_2,
-        #                                           YOUTUBE_URL_3,
-        #                                           YOUTUBE_URL_4,
-        #                                           YOUTUBE_URL_5,
-        #                                           YOUTUBE_URL_6,
-        #                                           YOUTUBE_URL_7], YOUTUBE_DIR),
-        #                       OpenAIWhisperParser())
-        #docs = loader.load()
-        ###docs.extend(loader.load())
+        # Load PDF
         loader = PyPDFLoader(PDF_URL)
         docs.extend(loader.load())
-        #
+        # Load Web
+        loader = WebBaseLoader(WEB_URL)
+        docs.extend(loader.load())
+        # Load YouTube
+        loader = GenericLoader(YoutubeAudioLoader([YOUTUBE_URL_1,
+                                                   YOUTUBE_URL_2,
+                                                   YOUTUBE_URL_3], YOUTUBE_DIR),
+                               OpenAIWhisperParser())
+        docs.extend(loader.load())
+        # Document splitting
         text_splitter = RecursiveCharacterTextSplitter(chunk_overlap = 150,
                                                        chunk_size = 1500)
         splits = text_splitter.split_documents(docs)
+        # Document storage
         vector_db = Chroma.from_documents(documents = splits,
                                           embedding = OpenAIEmbeddings(disallowed_special = ()),
                                           persist_directory = CHROMA_DIR)
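The hunk above ends at document storage, while the file's imports (RetrievalQA) and the RAG_CHAIN_PROMPT defined near the top imply a retrieval step further down in invoke(). A hedged sketch of the standard LangChain pattern those pieces support; the continuation itself and the name llm for the ChatOpenAI instance are assumptions, not part of the shown diff:

    # Assumed continuation (outside this diff): query the freshly built store.
    rag_chain = RetrievalQA.from_chain_type(llm,
                                            retriever = vector_db.as_retriever(),
                                            return_source_documents = True,
                                            chain_type_kwargs = {"prompt": RAG_CHAIN_PROMPT})
    result = rag_chain({"query": prompt})
    completion = result["result"]             # answer grounded in the retrieved splits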
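One design note on the storage step: Chroma.from_documents re-embeds every split on each RAG call, but since it persists to CHROMA_DIR, an existing store could simply be reopened. A hedged sketch of that alternative, reusing names defined in app.py; the commit itself does not do this:

    # Illustrative alternative: reopen the persisted collection instead of rebuilding it.
    vector_db = Chroma(embedding_function = OpenAIEmbeddings(disallowed_special = ()),
                       persist_directory = CHROMA_DIR)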