import os

from dotenv import load_dotenv, find_dotenv
from llama_index.core import KnowledgeGraphIndex, PropertyGraphIndex
from llama_index.core.chat_engine import ContextChatEngine
from llama_index.core.indices.property_graph import DynamicLLMPathExtractor
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.readers.file import PyMuPDFReader

def initialize_openai_creds():
    """Load environment variables and return credential dicts for each model."""
    dotenv_path = find_dotenv()
    if dotenv_path == "":
        print("No .env file found. Make sure the .env file is in the correct directory.")
    else:
        print(f".env file found at: {dotenv_path}")
        load_dotenv(dotenv_path)

    # GPT-3.5 credentials
    gpt35_creds = {
        "api_key": os.getenv("AZURE_OPENAI_API_KEY_GPT35"),
        "api_version": os.getenv("AZURE_API_VERSION"),
        "endpoint": os.getenv("AZURE_OPENAI_ENDPOINT_GPT35"),
        "temperature": 0,  # Default temperature
        "deployment_name": os.getenv("AZURE_DEPLOYMENT_NAME_GPT35")
    }

    # GPT-4o-mini credentials (shares the API version with GPT-3.5, but has its
    # own API key, endpoint, and deployment name)
    gpt4o_mini_creds = {
        "api_key": os.getenv("AZURE_OPENAI_API_KEY_GPT4O_MINI"),
        "api_version": os.getenv("AZURE_API_VERSION"),
        "endpoint": os.getenv("AZURE_OPENAI_ENDPOINT_GPT4O_MINI"),
        "temperature": 0,  # Default temperature
        "deployment_name": os.getenv("GPT4O_MINI_DEPLOYMENT_NAME")
    }

    # GPT-4o credentials
    gpt4o_creds = {
        "api_key": os.getenv("GPT4O_API_KEY"),
        "api_version": os.getenv("GPT4O_API_VERSION"),
        "endpoint": os.getenv("GPT4O_AZURE_ENDPOINT"),
        "deployment_name": os.getenv("GPT4O_DEPLOYMENT_NAME"),
        "temperature": float(os.getenv("GPT4O_TEMPERATURE", 0))  # os.getenv returns a string, so cast it
    }

    return gpt35_creds, gpt4o_mini_creds, gpt4o_creds
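
# Illustrative .env layout (values are placeholders; the variable names simply
# mirror the os.getenv calls above):
#     AZURE_API_VERSION=...
#     AZURE_OPENAI_API_KEY_GPT35=...
#     AZURE_OPENAI_ENDPOINT_GPT35=...
#     AZURE_DEPLOYMENT_NAME_GPT35=...
#     AZURE_OPENAI_API_KEY_GPT4O_MINI=...
#     AZURE_OPENAI_ENDPOINT_GPT4O_MINI=...
#     GPT4O_MINI_DEPLOYMENT_NAME=...
#     GPT4O_API_KEY=...
#     GPT4O_API_VERSION=...
#     GPT4O_AZURE_ENDPOINT=...
#     GPT4O_DEPLOYMENT_NAME=...
#     GPT4O_TEMPERATURE=0
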
def create_llm(model: str, gpt35_creds: dict, gpt4o_mini_creds: dict, gpt4o_creds: dict):
    """
    Initialize and return an AzureOpenAI LLM for the selected model.

    :param model: The model to initialize ("gpt35", "gpt-4o-mini", or "gpt4o").
    :param gpt35_creds: Credentials for GPT-3.5.
    :param gpt4o_mini_creds: Credentials for GPT-4o-mini.
    :param gpt4o_creds: Credentials for GPT-4o.
    :return: A configured AzureOpenAI instance.
    """
    if model == "gpt35":
        return AzureOpenAI(
            deployment_name=gpt35_creds["deployment_name"],
            temperature=gpt35_creds["temperature"],
            api_key=gpt35_creds["api_key"],
            azure_endpoint=gpt35_creds["endpoint"],
            api_version=gpt35_creds["api_version"]
        )
    elif model == "gpt-4o-mini":
        return AzureOpenAI(
            deployment_name=gpt4o_mini_creds["deployment_name"],
            temperature=gpt4o_mini_creds["temperature"],
            api_key=gpt4o_mini_creds["api_key"],
            azure_endpoint=gpt4o_mini_creds["endpoint"],
            api_version=gpt4o_mini_creds["api_version"]
        )
    elif model == "gpt4o":
        return AzureOpenAI(
            deployment_name=gpt4o_creds["deployment_name"],
            temperature=gpt4o_creds["temperature"],
            api_key=gpt4o_creds["api_key"],
            azure_endpoint=gpt4o_creds["endpoint"],
            api_version=gpt4o_creds["api_version"]
        )
    else:
        raise ValueError(f"Invalid model: {model}. Choose from 'gpt35', 'gpt-4o-mini', or 'gpt4o'.")
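
# Example usage (illustrative sketch, using only the helpers defined above):
#     gpt35_creds, gpt4o_mini_creds, gpt4o_creds = initialize_openai_creds()
#     llm = create_llm("gpt4o", gpt35_creds, gpt4o_mini_creds, gpt4o_creds)
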
def create_chat_engine(retriever, memory, llm):
    """Create and return a ContextChatEngine using the provided retriever and memory."""
    chat_engine = ContextChatEngine.from_defaults(
        retriever=retriever,
        memory=memory,
        llm=llm
    )
    return chat_engine
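
# Example usage (illustrative sketch; assumes documents were already loaded and
# an in-memory vector index is acceptable for retrieval):
#     from llama_index.core import VectorStoreIndex
#     from llama_index.core.memory import ChatMemoryBuffer
#     index = VectorStoreIndex.from_documents(documents)
#     memory = ChatMemoryBuffer.from_defaults(token_limit=3000)
#     engine = create_chat_engine(index.as_retriever(), memory, llm)
#     print(engine.chat("What does the document say about X?"))
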
def load_documents(filepaths):
    """
    Load and return documents from the specified file paths.

    :param filepaths: A string (single file path) or a list of strings (multiple file paths).
    :return: A list of loaded documents.
    """
    loader = PyMuPDFReader()
    # If a single string is passed, wrap it in a list for consistent handling
    if isinstance(filepaths, str):
        filepaths = [filepaths]
    # Load and accumulate documents from every file
    all_documents = []
    for filepath in filepaths:
        all_documents.extend(loader.load(file_path=filepath))
    return all_documents
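
# Example usage (illustrative; the file paths are placeholders):
#     documents = load_documents(["report_a.pdf", "report_b.pdf"])
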
def create_kg_index(
    documents,
    storage_context,
    llm,
    max_triplets_per_chunk=10,
    embed_model=None,
    include_embeddings=True,
    chunk_size=512
):
    """
    Build and return a KnowledgeGraphIndex over the given documents.

    :param documents: Documents to index.
    :param storage_context: StorageContext holding the target graph store.
    :param llm: LLM used to extract (subject, predicate, object) triplets.
    :param max_triplets_per_chunk: Maximum triplets extracted per chunk.
    :param embed_model: Embedding model; defaults to BAAI/bge-large-en-v1.5.
    :param include_embeddings: Whether to store embeddings alongside the triplets.
    :param chunk_size: Chunk size used by the sentence splitter.
    """
    # Instantiate the default embedding model here rather than in the signature,
    # so the model is not loaded at import time
    if embed_model is None:
        embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")
    splitter = SentenceSplitter(chunk_size=chunk_size)
    graph_index = KnowledgeGraphIndex.from_documents(
        documents,
        storage_context=storage_context,
        max_triplets_per_chunk=max_triplets_per_chunk,
        llm=llm,
        embed_model=embed_model,
        include_embeddings=include_embeddings,
        transformations=[splitter]
    )
    return graph_index
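
# Example usage (illustrative sketch; the Neo4j backend and its connection
# details are assumptions, and documents/llm come from the helpers above):
#     from llama_index.core import StorageContext
#     from llama_index.graph_stores.neo4j import Neo4jGraphStore
#     graph_store = Neo4jGraphStore(
#         username="neo4j", password="...", url="bolt://localhost:7687"
#     )
#     storage_context = StorageContext.from_defaults(graph_store=graph_store)
#     kg_index = create_kg_index(documents, storage_context, llm)
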
def create_pg_index(
    llm,
    documents,
    graph_store,
    max_triplets_per_chunk=10,
    num_workers=4,
    embed_kg_nodes=True,
    embed_model=None
):
    """
    Build and return a PropertyGraphIndex over the given documents.

    :param llm: LLM used by the path extractor.
    :param documents: Documents to index.
    :param graph_store: Property graph store (e.g. a Neo4jPropertyGraphStore).
    :param max_triplets_per_chunk: Maximum triplets extracted per chunk.
    :param num_workers: Number of parallel extraction workers.
    :param embed_kg_nodes: Whether to embed the extracted graph nodes.
    :param embed_model: Embedding model; defaults to BAAI/bge-large-en-v1.5.
    """
    # Instantiate the default embedding model here rather than in the signature,
    # so the model is not loaded at import time
    if embed_model is None:
        embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")
    splitter = SentenceSplitter(chunk_size=512)
    # DynamicLLMPathExtractor lets the LLM infer entity and relation types on the fly
    kg_extractor = DynamicLLMPathExtractor(
        llm=llm,
        max_triplets_per_chunk=max_triplets_per_chunk,
        num_workers=num_workers
    )
    # Create the Property Graph Index
    graph_index = PropertyGraphIndex.from_documents(
        documents,
        property_graph_store=graph_store,
        embed_model=embed_model,
        embed_kg_nodes=embed_kg_nodes,
        kg_extractors=[kg_extractor],
        transformations=[splitter]
    )
    return graph_index
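
# Example usage (illustrative sketch; the connection details are placeholders,
# and documents/llm come from the helpers above):
#     pg_store = Neo4jPropertyGraphStore(
#         username="neo4j", password="...", url="bolt://localhost:7687"
#     )
#     pg_index = create_pg_index(llm, documents, pg_store)
#     query_engine = pg_index.as_query_engine()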