import os
from json import dumps, loads

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from llama_index import (Document, GPTVectorStoreIndex, LLMPredictor,
                         PromptHelper, ServiceContext, StorageContext,
                         load_index_from_storage)
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from utils.customLLM import CustomLLM

load_dotenv()

# PromptHelper settings: model context window, maximum number of tokens to
# generate, and the overlap ratio between consecutive text chunks.
context_window = 2048
num_output = 525
chunk_overlap_ratio = 0.2
prompt_helper = PromptHelper(context_window, num_output, chunk_overlap_ratio)

# Route both indexing and querying through the local CustomLLM.
llm_predictor = LLMPredictor(llm=CustomLLM())
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)


def prepare_data(file_path: str):
    """Read records from a JSON file and wrap each one in a llama_index Document."""
    df = pd.read_json(file_path)
    # Treat empty strings as missing values and drop incomplete rows.
    df = df.replace(to_replace="", value=np.nan).dropna(axis=0)

    parsed = loads(df.to_json(orient="records"))

    documents = []
    for item in parsed:
        # Use the paragraph text as the document body and the MongoDB ObjectId as doc_id.
        document = Document(item['paragraphText'],
                            item['_id']['$oid'],
                            extra_info={"chapter": item['chapter'],
                                        "article": item['article'],
                                        "title": item['title']})
        documents.append(document)

    return documents
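
# For reference, each record handled by prepare_data is expected to look roughly
# like the sketch below; the field names come from the accesses above, while the
# values are purely illustrative:
# {
#     "_id": {"$oid": "..."},
#     "chapter": "...",
#     "article": "...",
#     "title": "...",
#     "paragraphText": "..."
# }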


def initialize_index(index_name):
    """Load the vector index from disk if it was persisted before; otherwise build and persist it."""
    file_path = f"./vectorStores/{index_name}"
    if os.path.exists(file_path):
        # Rebuild the index from the persisted storage context, reusing the
        # custom service context so queries keep going through CustomLLM.
        storage_context = StorageContext.from_defaults(persist_dir=file_path)
        index = load_index_from_storage(storage_context, service_context=service_context)
        return index
    else:
        documents = prepare_data(r"./assets/regItems.json")
        index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context)
        index.storage_context.persist(file_path)
        return index
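

# Minimal usage sketch, not part of the original module: the index name and the
# question below are illustrative, and this assumes the installed llama_index
# version exposes the as_query_engine() API on the loaded index.
if __name__ == "__main__":
    index = initialize_index("regItems")
    # The query engine inherits the service context (and thus CustomLLM) from the index.
    query_engine = index.as_query_engine()
    response = query_engine.query("What does the regulation say about scope?")
    print(response)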