import os
import pickle  # used only by the commented-out Hugging Face Hub read/write paths below
from json import loads

import numpy as np
import openai
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import HfFileSystem
from llama_index import (
    Document,
    GPTVectorStoreIndex,
    LLMPredictor,
    PromptHelper,
    ServiceContext,
    StorageContext,
    load_index_from_storage,
)
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline  # only used by the commented-out local-model path below
from utils.customLLM import CustomLLM

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
fs = HfFileSystem()
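
# NOTE: HfFileSystem addresses files with repo-style paths, e.g.
# "datasets/<user>/<repo>/<path>" (path shape shown is illustrative; point it
# at whichever Hub repo stores the persisted index).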
# get model (optional local Hugging Face model path, superseded by CustomLLM below)
# model_name = "bigscience/bloom-560m"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)
# define prompt helper
context_window = 2048  # maximum input size in tokens
num_output = 525  # tokens reserved for the model's output
chunk_overlap_ratio = 0.2  # chunk overlap as a fraction of chunk size
prompt_helper = PromptHelper(context_window, num_output, chunk_overlap_ratio)
# create a text-generation pipeline (only needed for the local-model path above)
# pl = pipeline(
#     model=model,
#     tokenizer=tokenizer,
#     task="text-generation",
#     # device=0,  # GPU device number
#     # max_length=512,
#     do_sample=True,
#     top_p=0.95,
#     top_k=50,
#     temperature=0.7,
# )
# define the LLM and bundle it with the prompt helper into a service context
llm_predictor = LLMPredictor(llm=CustomLLM())
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor, prompt_helper=prompt_helper
)


def prepare_data(file_path: str):
    """Read regulation items from a JSON file and wrap each record as a llama_index Document."""
    df = pd.read_json(file_path)
    df = df.replace(to_replace="", value=np.nan).dropna(axis=0)  # drop rows with empty fields
    parsed = loads(df.to_json(orient="records"))
    documents = []
    for item in parsed:
        document = Document(
            item["paragraphText"],  # document body text
            item["_id"]["$oid"],  # MongoDB ObjectId reused as the doc id
            extra_info={
                "chapter": item["chapter"],
                "article": item["article"],
                "title": item["title"],
            },
        )
        documents.append(document)
    return documents
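
# Quick sanity check (illustrative; assumes ./assets/regItems.json matches the
# schema read above: paragraphText, _id.$oid, chapter, article, title):
# docs = prepare_data("./assets/regItems.json")
# print(len(docs), docs[0].extra_info)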


def initialize_index(index_name):
    """Load a persisted vector index from disk if present; otherwise build and persist one."""
    file_path = f"./vectorStores/{index_name}"
    if os.path.exists(file_path):
        # rebuild storage context from the persisted index files
        storage_context = StorageContext.from_defaults(persist_dir=file_path)
        # local load access; pass the service context so queries keep using CustomLLM
        index = load_index_from_storage(storage_context, service_context=service_context)
        # huggingface repo load access (binary mode, since pickle works on bytes)
        # with fs.open(file_path, "rb") as file:
        #     index = pickle.loads(file.read())
        return index
    else:
        documents = prepare_data(r"./assets/regItems.json")
        index = GPTVectorStoreIndex.from_documents(
            documents, service_context=service_context
        )
        # local write access
        index.storage_context.persist(file_path)
        # huggingface repo write access (binary mode, since pickle produces bytes)
        # with fs.open(file_path, "wb") as file:
        #     file.write(pickle.dumps(index))
        return index
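

# Minimal usage sketch ("regulations" is a hypothetical index name; as_query_engine()
# follows the legacy llama_index (< 0.10) query API used throughout this file):
if __name__ == "__main__":
    index = initialize_index("regulations")
    query_engine = index.as_query_engine()
    response = query_engine.query("What does the first chapter cover?")
    print(response)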