File size: 3,073 Bytes
fb72340 5f04412 50dc063 5f04412 a0b5dc6 5f04412 c947c47 1342013 5f04412 ebaeae5 e946a29 0ddb69a 5f04412 0ddb69a 5f04412 0ddb69a 5f04412 0ddb69a 5f04412 0ddb69a 3940450 0ddb69a 5f04412 0ddb69a 5f04412 0ddb69a 3940450 0ddb69a 5f04412 0ddb69a 3940450 0ddb69a 5f04412 0ddb69a 5f04412 e946a29 0ddb69a a0b5dc6 5f04412 0ddb69a c11e160 0ddb69a 5f04412 e946a29 0ddb69a 3940450 008f2f7 3940450 0ddb69a 5f04412 e946a29 ff3ca13 5f04412 ff3ca13 5f04412 a0b5dc6 5ad33e1 a0b5dc6 e946a29 a0b5dc6 0ddb69a db1e09d 5f04412 0ddb69a a0b5dc6 0ddb69a 5f04412 0ddb69a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
import os, requests
from llama_hub.youtube_transcript import YoutubeTranscriptReader
from llama_index import download_loader, PromptTemplate, ServiceContext
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index.llms import OpenAI
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from pathlib import Path
from pymongo import MongoClient
from rag_base import BaseRAG
class LlamaIndexRAG(BaseRAG):
MONGODB_DB_NAME = "llamaindex_db"
def load_documents(self):
docs = []
# PDF
PDFReader = download_loader("PDFReader")
loader = PDFReader()
out_dir = Path("data")
if not out_dir.exists():
os.makedirs(out_dir)
out_path = out_dir / "gpt-4.pdf"
if not out_path.exists():
r = requests.get(self.PDF_URL)
with open(out_path, "wb") as f:
f.write(r.content)
docs.extend(loader.load_data(file = Path(out_path)))
#print("docs = " + str(len(docs)))
# Web
SimpleWebPageReader = download_loader("SimpleWebPageReader")
loader = SimpleWebPageReader()
docs.extend(loader.load_data(urls = [self.WEB_URL]))
#print("docs = " + str(len(docs)))
# YouTube
loader = YoutubeTranscriptReader()
docs.extend(loader.load_data(ytlinks = [self.YOUTUBE_URL_1,
self.YOUTUBE_URL_2]))
#print("docs = " + str(len(docs)))
return docs
def store_documents(self, config, docs):
storage_context = StorageContext.from_defaults(
vector_store = self.get_vector_store()
)
VectorStoreIndex.from_documents(
chunk_overlap = config["chunk_overlap"],
chunk_size = config["chunk_size"],
documents = docs,
#embedding = x,
storage_context = storage_context
)
def get_vector_store(self):
return MongoDBAtlasVectorSearch(
MongoClient(self.MONGODB_ATLAS_CLUSTER_URI),
db_name = self.MONGODB_DB_NAME,
collection_name = self.MONGODB_COLLECTION_NAME,
index_name = self.MONGODB_INDEX_NAME
)
def ingestion(self, config):
docs = self.load_documents()
self.store_documents(config, docs)
def get_llm(self, config):
return OpenAI(
model = config["model_name"],
temperature = config["temperature"]
)
def retrieval(self, config, prompt):
service_context = ServiceContext.from_defaults(
llm = self.get_llm(config)
)
index = VectorStoreIndex.from_vector_store(
vector_store = self.get_vector_store()
)
query_engine = index.as_query_engine(
service_context = service_context,
similarity_top_k = config["k"]
)
return query_engine.query(prompt) |