# -*- coding: utf-8 -*-
"""RAG
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/18JQq2-GCmrrwAk9UuvqqeVrrnB4raKZt
"""
# !pip install -q pypdf
# !pip install torch
# !pip install -q transformers
# !pip -q install sentence-transformers
# !pip install -q llama-index
# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir
# !pip install llama-index-embeddings-huggingface
# !pip install llama-index-llms-llama-cpp
# (GPU support comes from the CUBLAS build of llama-cpp-python above; no separate "pip install cuda" is needed)
import torch
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
messages_to_prompt,
completion_to_prompt,
)
llm = LlamaCPP(
    # you can pass the URL of a GGUF model to download it automatically
    # model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf',
    model_url='https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_M.gguf',
    # optionally, set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    # Mistral-7B-Instruct-v0.2 supports a much larger context window, but 4096 keeps memory usage modest
    context_window=4096,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__(); n_gpu_layers=-1 offloads all layers to the GPU
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into the llama2/Mistral instruct prompt format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
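# Optional sanity check (not part of the original notebook): run a one-off completion to confirm
# the GGUF model downloaded and loads with GPU offload before building the index.
# The prompt below is just a placeholder and this block can be removed.
sanity_response = llm.complete("In one sentence, what is retrieval-augmented generation?")
print(sanity_response.text)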
from llama_index.core import Document
documents = SimpleDirectoryReader(
    input_files=["/content/Mindcase Data.pdf"]
).load_data()
# merge all pages into a single Document so the sentence-window parser sees continuous text
documents = Document(text="\n\n".join([doc.text for doc in documents]))
import os
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core import VectorStoreIndex, ServiceContext, StorageContext, load_index_from_storage
def get_build_index(documents, llm, embed_model="local:BAAI/bge-small-en-v1.5",
                    sentence_window_size=3, save_dir="./vector_store/index"):
    # parse each sentence into its own node and store the surrounding sentences as "window" metadata
    node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=sentence_window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    sentence_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=node_parser,
    )
    if not os.path.exists(save_dir):
        # build the index and persist it to disk
        index = VectorStoreIndex.from_documents(
            [documents], service_context=sentence_context
        )
        index.storage_context.persist(persist_dir=save_dir)
    else:
        # load the existing index from disk
        index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=sentence_context,
        )
    return index
# get the vector index
vector_index = get_build_index(documents=documents, llm=llm, embed_model="local:BAAI/bge-small-en-v1.5", sentence_window_size=3, save_dir="./vector_store/index")
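# Optional check (assumes the PDF above yielded at least one node): retrieve a single node and
# print its "window" metadata to see the surrounding sentences that the SentenceWindowNodeParser
# stored alongside each sentence. The query string here is only a placeholder.
sample_nodes = vector_index.as_retriever(similarity_top_k=1).retrieve("What is this document about?")
if sample_nodes:
    print(sample_nodes[0].node.metadata["window"])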
from llama_index.core.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank

def get_query_engine(sentence_index, similarity_top_k=6, rerank_top_n=2):
    # replace each retrieved sentence with its full window of surrounding sentences
    postproc = MetadataReplacementPostProcessor(target_metadata_key="window")
    # rerank the expanded windows with a cross-encoder and keep the best rerank_top_n
    rerank = SentenceTransformerRerank(
        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
    )
    engine = sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k, node_postprocessors=[postproc, rerank]
    )
    return engine

query_engine = get_query_engine(sentence_index=vector_index, similarity_top_k=6, rerank_top_n=2)
def query(question):
    # run the question through the sentence-window query engine and return its response
    return query_engine.query(question)
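# Example usage (hypothetical question; adjust it to the contents of Mindcase Data.pdf).
# query_engine.query() returns a Response object: printing it shows the synthesized answer,
# and response.source_nodes holds the reranked sentence windows used as context.
if __name__ == "__main__":
    answer = query("Summarize the main points of the document.")
    print(answer)
    for node in answer.source_nodes:
        print(node.score, node.node.metadata.get("original_text", ""))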