|
import logging
import os
import sys
import urllib.request
from pathlib import Path

from llama_hub.youtube_transcript import YoutubeTranscriptReader
from llama_index import download_loader, PromptTemplate
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from pymongo import MongoClient
|
|
|
# Source material about GPT-4 ingested by load_documents():
# the technical report PDF, the announcement page, and two talks.
PDF_URL = "https://arxiv.org/pdf/2303.08774.pdf"

WEB_URL = "https://openai.com/research/gpt-4"

YOUTUBE_URL_1 = "https://www.youtube.com/watch?v=--khbXchTeE"

YOUTUBE_URL_2 = "https://www.youtube.com/watch?v=hdhZwyf24mE"



# Atlas connection string is required — a missing env var fails fast
# here with KeyError at import time rather than later mid-ingestion.
MONGODB_ATLAS_CLUSTER_URI = os.environ["MONGODB_ATLAS_CLUSTER_URI"]

MONGODB_DB_NAME = "llamaindex_db"

MONGODB_COLLECTION_NAME = "gpt-4"

# Name of the Atlas Search vector index on the collection.
MONGODB_INDEX_NAME = "default"



# Route INFO-level logs to stdout.
logging.basicConfig(stream = sys.stdout, level = logging.INFO)

# NOTE(review): basicConfig above already installs a stdout handler, so
# this second handler makes every record print twice — confirm the
# duplication is intentional (some llama_index examples do this) before
# removing.
logging.getLogger().addHandler(logging.StreamHandler(stream = sys.stdout))
|
|
|
def load_documents():
    """Gather GPT-4 source documents from three sources.

    Sources: the GPT-4 technical report PDF (downloaded to ``data/`` and
    cached there), the OpenAI announcement web page, and two YouTube
    talk transcripts.

    Returns:
        list: llama_index Document objects from all three loaders.
    """
    docs = []

    # --- PDF: download the technical report once, then parse it. ---
    PDFReader = download_loader("PDFReader")
    loader = PDFReader()

    out_dir = Path("data")
    # Idempotent directory creation (replaces the exists()/makedirs pair).
    out_dir.mkdir(parents=True, exist_ok=True)

    out_path = out_dir / "gpt-4.pdf"
    if not out_path.exists():
        # BUG FIX: the original called requests.get() but `requests` was
        # never imported (NameError). Use the stdlib downloader instead.
        urllib.request.urlretrieve(PDF_URL, out_path)

    docs.extend(loader.load_data(file=out_path))

    # --- Web page: scrape the GPT-4 announcement. ---
    SimpleWebPageReader = download_loader("SimpleWebPageReader")
    loader = SimpleWebPageReader()
    docs.extend(loader.load_data(urls=[WEB_URL]))

    # --- YouTube: pull transcripts for both talks in one call. ---
    loader = YoutubeTranscriptReader()
    docs.extend(loader.load_data(ytlinks=[YOUTUBE_URL_1, YOUTUBE_URL_2]))

    return docs
|
|
|
def store_documents(config, docs):
    """Embed *docs* and persist them into the MongoDB Atlas vector store.

    Args:
        config: ingestion configuration (accepted for interface
            consistency; not consulted here).
        docs: llama_index Documents to embed and index.
    """
    context = StorageContext.from_defaults(vector_store=get_vector_store())
    # Building the index writes the embedded nodes through the storage
    # context into Atlas; the returned index object is not needed here.
    VectorStoreIndex.from_documents(docs, storage_context=context)
|
|
|
def get_vector_store():
    """Construct the Atlas vector-store handle from the module settings."""
    client = MongoClient(MONGODB_ATLAS_CLUSTER_URI)
    return MongoDBAtlasVectorSearch(
        client,
        db_name=MONGODB_DB_NAME,
        collection_name=MONGODB_COLLECTION_NAME,
        index_name=MONGODB_INDEX_NAME,
    )
|
|
|
def rag_ingestion_llamaindex(config):
    """Run end-to-end ingestion: load all source documents, then index them.

    Args:
        config: ingestion configuration, forwarded to store_documents().
    """
    store_documents(config, load_documents())
|
|
|
def rag_retrieval(config, prompt):
    """Answer *prompt* against the Atlas-backed vector index.

    Args:
        config: dict carrying "k", the number of similar chunks to retrieve.
        prompt: user query string.

    Returns:
        The llama_index query response.
    """
    # Rehydrate the index from the existing Atlas collection — no
    # documents are (re)embedded on the retrieval path.
    store = get_vector_store()
    index = VectorStoreIndex.from_vector_store(vector_store=store)
    engine = index.as_query_engine(similarity_top_k=config["k"])
    return engine.query(prompt)