|
import os |
|
import pickle |
|
from json import dumps, loads |
|
|
|
import numpy as np |
|
import openai |
|
import pandas as pd |
|
from dotenv import load_dotenv |
|
from huggingface_hub import HfFileSystem |
|
from llama_index import ( |
|
Document, |
|
GPTVectorStoreIndex, |
|
LLMPredictor, |
|
PromptHelper, |
|
ServiceContext, |
|
StorageContext, |
|
load_index_from_storage, |
|
) |
|
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline |
|
|
|
from utils.customLLM import CustomLLM |
|
|
|
# Load variables from a local .env file into the process environment first,
# so that the key lookup below can see them.
load_dotenv()

# Hand the OpenAI client its API key; this is None when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Module-wide Hugging Face Hub filesystem handle.
fs = HfFileSystem()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Prompt-sizing settings. They are passed positionally, in this order, to
# PromptHelper below.
# NOTE(review): presumably token counts for the model's context window and
# for the generated answer, plus the fraction of overlap between adjacent
# chunks -- confirm against the installed llama_index version.
context_window = 2048

num_output = 525

chunk_overlap_ratio = 0.2

prompt_helper = PromptHelper(
    context_window,
    num_output,
    chunk_overlap_ratio,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Wrap the project's CustomLLM in a predictor so llama_index can call it.
llm_predictor = LLMPredictor(llm=CustomLLM())

# Shared service context: bundles the predictor above with the module-level
# prompt helper. It is used when a new index is built from documents.
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    prompt_helper=prompt_helper,
)
|
|
|
|
|
def prepare_data(file_path: str):
    """Read a JSON file of records and turn each complete record into a Document.

    Empty-string cells are treated as missing, and any row that has a missing
    value in any column is discarded before conversion.

    Args:
        file_path: Location of the JSON file, passed to ``pandas.read_json``.

    Returns:
        A list of ``Document`` objects, one per surviving row. Each takes its
        text from ``paragraphText``, its id from ``_id.$oid``, and carries
        ``chapter``, ``article`` and ``title`` as extra info.
    """
    frame = pd.read_json(file_path)

    # Blank strings count as missing data; drop every row that has a gap.
    frame = frame.replace(to_replace="", value=np.nan)
    frame = frame.dropna(axis=0)

    # Round-trip through JSON text so each row becomes a plain dict.
    records = loads(frame.to_json(orient="records"))

    return [
        Document(
            text=record["paragraphText"],
            doc_id=record["_id"]["$oid"],
            extra_info={
                "chapter": record["chapter"],
                "article": record["article"],
                "title": record["title"],
            },
        )
        for record in records
    ]
|
|
|
|
|
def initialize_index(index_name):
    """Return the vector index named *index_name*, building it on first use.

    If ``./vectorStores/<index_name>`` already exists, the index is reloaded
    from that directory. Otherwise it is built from
    ``./assets/regItems.json`` and persisted to that directory so that later
    calls can reload it.

    Args:
        index_name: Name of the sub-directory under ``./vectorStores`` that
            holds (or will hold) the persisted index.

    Returns:
        The loaded or newly built index, bound to the module-level
        ``service_context`` in both cases.
    """
    persist_dir = f"./vectorStores/{index_name}"

    if os.path.exists(persist_dir):
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        # Reload with the same service context the index was built with.
        # Without it, llama_index falls back to its defaults and the cached
        # path would ignore the module's CustomLLM and prompt helper.
        return load_index_from_storage(
            storage_context, service_context=service_context
        )

    # First use: build from the source records, then persist for later calls.
    documents = prepare_data(r"./assets/regItems.json")
    index = GPTVectorStoreIndex.from_documents(
        documents, service_context=service_context
    )
    index.storage_context.persist(persist_dir)
    return index
|
|