File size: 3,073 Bytes
fb72340
5f04412
 
50dc063
5f04412
a0b5dc6
5f04412
 
 
 
c947c47
1342013
5f04412
ebaeae5
 
 
e946a29
0ddb69a
5f04412
0ddb69a
 
 
 
5f04412
0ddb69a
 
5f04412
0ddb69a
5f04412
0ddb69a
3940450
0ddb69a
 
5f04412
0ddb69a
 
5f04412
0ddb69a
 
 
3940450
0ddb69a
5f04412
0ddb69a
 
3940450
 
0ddb69a
5f04412
0ddb69a
5f04412
e946a29
0ddb69a
a0b5dc6
 
5f04412
0ddb69a
c11e160
 
 
 
0ddb69a
 
5f04412
e946a29
0ddb69a
3940450
008f2f7
3940450
 
0ddb69a
5f04412
e946a29
ff3ca13
5f04412
ff3ca13
5f04412
a0b5dc6
 
 
5ad33e1
 
a0b5dc6
e946a29
a0b5dc6
 
 
 
0ddb69a
db1e09d
 
5f04412
0ddb69a
a0b5dc6
0ddb69a
 
5f04412
0ddb69a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import os, requests

from llama_hub.youtube_transcript import YoutubeTranscriptReader
from llama_index import download_loader, PromptTemplate, ServiceContext
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index.llms import OpenAI
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

from pathlib import Path
from pymongo import MongoClient
from rag_base import BaseRAG

class LlamaIndexRAG(BaseRAG):
    """RAG pipeline implemented with LlamaIndex on top of MongoDB Atlas.

    Loads documents from three sources (a PDF download, a web page, and
    YouTube transcripts), embeds and stores them in a MongoDB Atlas vector
    index, and answers queries through a LlamaIndex query engine.

    NOTE(review): source URLs and the MongoDB connection settings
    (PDF_URL, WEB_URL, YOUTUBE_URL_1/2, MONGODB_ATLAS_CLUSTER_URI,
    MONGODB_COLLECTION_NAME, MONGODB_INDEX_NAME) are presumably inherited
    from BaseRAG — confirm against rag_base.py.
    """

    # Name of the MongoDB database that holds the LlamaIndex collection.
    MONGODB_DB_NAME = "llamaindex_db"

    def load_documents(self):
        """Gather documents from the PDF, web, and YouTube sources.

        Returns:
            list: LlamaIndex ``Document`` objects from all three sources.

        Raises:
            requests.HTTPError: if the PDF download returns an error status.
        """
        docs = []

        # --- PDF: download once into ./data, then parse with PDFReader. ---
        PDFReader = download_loader("PDFReader")
        loader = PDFReader()
        out_dir = Path("data")
        # Idempotent creation; avoids the check-then-act race of
        # exists() + os.makedirs().
        out_dir.mkdir(parents=True, exist_ok=True)

        out_path = out_dir / "gpt-4.pdf"

        if not out_path.exists():
            # Bounded timeout so a dead host can't hang ingestion, and
            # raise_for_status() so an HTTP error page is never saved as
            # if it were the PDF.
            r = requests.get(self.PDF_URL, timeout=60)
            r.raise_for_status()
            with open(out_path, "wb") as f:
                f.write(r.content)

        # out_path is already a Path; no re-wrapping needed.
        docs.extend(loader.load_data(file=out_path))

        # --- Web page. ---
        SimpleWebPageReader = download_loader("SimpleWebPageReader")
        loader = SimpleWebPageReader()
        docs.extend(loader.load_data(urls=[self.WEB_URL]))

        # --- YouTube transcripts. ---
        loader = YoutubeTranscriptReader()
        docs.extend(loader.load_data(ytlinks=[self.YOUTUBE_URL_1,
                                              self.YOUTUBE_URL_2]))

        return docs

    def store_documents(self, config, docs):
        """Chunk, embed, and persist *docs* into the Atlas vector store.

        Args:
            config (dict): must contain ``chunk_size`` and ``chunk_overlap``.
            docs (list): LlamaIndex documents produced by load_documents().
        """
        storage_context = StorageContext.from_defaults(
            vector_store=self.get_vector_store()
        )

        # Index construction embeds the chunks and writes them through the
        # storage context; the returned index object is not needed here.
        VectorStoreIndex.from_documents(
            chunk_overlap=config["chunk_overlap"],
            chunk_size=config["chunk_size"],
            documents=docs,
            storage_context=storage_context
        )

    def get_vector_store(self):
        """Return a MongoDB Atlas vector store bound to this class's DB settings."""
        return MongoDBAtlasVectorSearch(
            MongoClient(self.MONGODB_ATLAS_CLUSTER_URI),
            db_name=self.MONGODB_DB_NAME,
            collection_name=self.MONGODB_COLLECTION_NAME,
            index_name=self.MONGODB_INDEX_NAME
        )

    def ingestion(self, config):
        """Run the full ingestion pass: load all sources, then store them."""
        docs = self.load_documents()
        self.store_documents(config, docs)

    def get_llm(self, config):
        """Build the OpenAI LLM from ``config['model_name']`` / ``config['temperature']``."""
        return OpenAI(
            model=config["model_name"],
            temperature=config["temperature"]
        )

    def retrieval(self, config, prompt):
        """Answer *prompt* via similarity search over the stored vectors.

        Args:
            config (dict): must contain ``model_name``, ``temperature``, ``k``.
            prompt (str): the user query.

        Returns:
            The LlamaIndex query response object.
        """
        service_context = ServiceContext.from_defaults(
            llm=self.get_llm(config)
        )

        # Re-open the existing Atlas index; no re-ingestion on the query path.
        index = VectorStoreIndex.from_vector_store(
            vector_store=self.get_vector_store()
        )

        query_engine = index.as_query_engine(
            service_context=service_context,
            similarity_top_k=config["k"]
        )

        return query_engine.query(prompt)