#####################################################
### DOCUMENT PROCESSOR [MODELS]
#####################################################
# Jonathan Wang
# ABOUT:
# This project creates an app to chat with PDFs.
# This is the LANGUAGE MODELS
# that are used in the document reader.
#####################################################
## TODOS:
# <!> Add support for vLLM / AWQ / GPTQ models. (probably not going to be done due to lack of attention scores)
# Add KTransformers backend?
# https://github.com/kvcache-ai/ktransformers
# https://github.com/Tada-AI/pdf_parser
#####################################################
## IMPORTS:
from __future__ import annotations
import gc
import logging
import sys
from typing import (
Any,
Callable,
Dict,
List,
Optional,
Protocol,
Sequence,
Union,
cast,
runtime_checkable,
)
import streamlit as st
import torch
from llama_index.core.base.embeddings.base import BaseEmbedding
from llama_index.core.base.llms.base import BaseLLM
from llama_index.core.base.llms.generic_utils import (
messages_to_prompt as generic_messages_to_prompt,
)
from llama_index.core.base.llms.types import (
ChatMessage,
ChatResponse,
ChatResponseGen,
CompletionResponse,
CompletionResponseGen,
LLMMetadata,
MessageRole,
)
from llama_index.core.bridge.pydantic import Field, PrivateAttr, WithJsonSchema
from llama_index.core.callbacks import CallbackManager
from llama_index.core.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS
from llama_index.core.llms.callbacks import (
llm_chat_callback,
llm_completion_callback,
)
from llama_index.core.multi_modal_llms import MultiModalLLM
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.prompts.base import PromptTemplate
from llama_index.core.schema import ImageDocument, ImageNode
from llama_index.core.types import BaseOutputParser, PydanticProgramMode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from PIL import Image as PILImage
from transformers import (
AutoImageProcessor,
AutoModelForVision2Seq,
AutoTokenizer,
LogitsProcessor,
QuantoConfig,
StoppingCriteria,
StoppingCriteriaList,
)
from typing_extensions import Annotated
# from wtpsplit import SaT # Sentence segmentation model. Dropping this. Requires adapters=0.2.1->Transformers=4.39.3 | Phi3 Vision requires Transformers 4.40.2
## NOTE: Proposal for LAZY LOADING packages for running LLMs:
# Currently not done because the emphasis is on local inference with the ability to get Attention Scores, which is not yet supported by non-HF-Transformers backends.
## LLamacpp:
# from llama_index.llms.llama_cpp import LlamaCPP
# from llama_index.llms.llama_cpp.llama_utils import (
# messages_to_prompt,
# completion_to_prompt
# )
## HF Transformers LLM:
# from transformers import AutoTokenizer, BitsAndBytesConfig
# from llama_index.llms.huggingface import HuggingFaceLLM
## GROQ
# from llama_index.llms.groq import Groq
#####################################################
### SETTINGS:
DEFAULT_HF_MULTIMODAL_LLM = "Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5"
DEFAULT_HF_MULTIMODAL_CONTEXT_WINDOW = 1024
DEFAULT_HF_MULTIMODAL_MAX_NEW_TOKENS = 1024
#####################################################
### CODE:
logger = logging.getLogger(__name__)
@st.cache_resource
def get_embedder(
model_path: str = "mixedbread-ai/mxbai-embed-large-v1",
device: str = "cuda", # 'cpu' is unbearably slow
) -> BaseEmbedding:
"""Given the path to an embedding model, load it."""
# NOTE: okay we definitely could have not made this wrapper, but shrug
return HuggingFaceEmbedding(
model_path,
device=device
)
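# Example usage (sketch): embed a query string with the loaded model.
# `get_text_embedding` is part of the LlamaIndex BaseEmbedding interface.
# embedder = get_embedder()
# vector = embedder.get_text_embedding("What does this PDF say about termination clauses?")
# print(len(vector))  # embedding dimension (1024 for mxbai-embed-large-v1)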
@st.cache_resource
def get_reranker(
model_path: str = "mixedbread-ai/mxbai-rerank-large-v1",
top_n: int = 3,
device: str = "cpu", # 'cuda' if we were rich
) -> SentenceTransformerRerank: # technically this is a BaseNodePostprocessor, but that seems too abstract.
"""Given the path to a reranking model, load it."""
# NOTE: okay we definitely could have not made this wrapper, but shrug
return SentenceTransformerRerank(
model=model_path,
top_n=top_n,
device=device
)
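# Example usage (sketch): rerank retrieved nodes against the user query before synthesis.
# `postprocess_nodes` comes from the BaseNodePostprocessor interface; `retrieved_nodes`
# is a hypothetical List[NodeWithScore] produced by a retriever.
# reranker = get_reranker(top_n=3)
# top_nodes = reranker.postprocess_nodes(retrieved_nodes, query_str="What is the invoice total?")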
## LLM Options Below
# def _get_llamacpp_llm(
# model_path: str,
# model_seed: int = 31415926,
# model_temperature: float = 1e-64, # ideally 0, but HF-type doesn't allow that. # a good dev might use sys.float_info()['min']
# model_context_length: Optional[int] = 8192,
# model_max_new_tokens: Optional[int] = 1024,
# ) -> BaseLLM:
# """Load a LlamaCPP model using GPU and other sane defaults."""
# # Lazy Loading
# from llama_index.llms.llama_cpp import LlamaCPP
# from llama_index.llms.llama_cpp.llama_utils import (
# messages_to_prompt,
# completion_to_prompt
# )
# # Arguments to Pass
# llm = LlamaCPP(
# model_path=model_path,
# temperature=model_temperature,
# max_new_tokens=model_max_new_tokens,
# context_window=model_context_length,
# # kwargs to pass to __call__()
# generate_kwargs={'seed': model_seed}, # {'temperature': TEMPERATURE, 'top_p':0.7, 'min_p':0.1, 'seed': MODEL_SEED},
# # kwargs to pass to __init__()
# # set to at least 1 to use GPU
# model_kwargs={'n_gpu_layers': -1, 'n_threads': os.cpu_count()-1}, #, 'rope_freq_scale': 0.83, 'rope_freq_base': 20000},
# # transform inputs into model format
# messages_to_prompt=messages_to_prompt,
# completion_to_prompt=completion_to_prompt,
# verbose=True,
# )
# return (llm)
@st.cache_resource
def _get_hf_llm(
model_path: str,
    model_temperature: float = sys.float_info.min,  # ideally 0, but the HF implementation rejects a temperature of exactly 0, so use the smallest positive float instead.
model_context_length: int | None = 16384,
model_max_new_tokens: int | None = 2048,
hf_quant_level: int | None = 8,
) -> BaseLLM:
"""Load a Huggingface-Transformers based model using sane defaults."""
# Fix temperature if needed; HF implementation complains about it being zero
model_temperature = max(sys.float_info.min, model_temperature)
# Get Quantization with BitsandBytes
quanto_config = None # NOTE: by default, no quantization.
if (hf_quant_level == 4):
# bnb_config = BitsAndBytesConfig(
# # load_in_8bit=True,
# load_in_4bit=True,
# # bnb_4bit_use_double_quant=True,
# bnb_4bit_quant_type="nf4",
# bnb_4bit_compute_dtype='bfloat16', # NOTE: Tesla T4 GPUs are too crappy for bfloat16
# # bnb_4bit_compute_dtype='float16'
# )
quanto_config = QuantoConfig(
weights="int4" # there's also 'int2' if you're crazy...
)
elif (hf_quant_level == 8):
# bnb_config = BitsAndBytesConfig(
# load_in_8bit=True
# )
quanto_config = QuantoConfig(
weights="int8"
)
# Get Stopping Tokens for Llama3 based models, because they're /special/ and added a new one.
tokenizer = AutoTokenizer.from_pretrained(
model_path
)
stopping_ids = [
tokenizer.eos_token_id,
tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
return HuggingFaceLLM(
model_name=model_path,
tokenizer_name=model_path,
stopping_ids=stopping_ids,
max_new_tokens=model_max_new_tokens or DEFAULT_NUM_OUTPUTS,
context_window=model_context_length or DEFAULT_CONTEXT_WINDOW,
tokenizer_kwargs={"trust_remote_code": True},
model_kwargs={"trust_remote_code": True, "quantization_config": quanto_config},
        generate_kwargs={
            "do_sample": model_temperature > sys.float_info.min,  # greedy decode when the temperature is effectively zero
            "temperature": model_temperature,
        },
is_chat_model=True,
)
@st.cache_resource
def get_llm(
model_path: str = "meta-llama/Meta-Llama-3.1-8B-Instruct",
    model_temperature: float = 0,  # 0 requests greedy decoding; _get_hf_llm clamps it to the smallest positive float since the HF implementation rejects exactly 0.
model_context_length: int | None = 8192,
model_max_new_tokens: int | None = 1024,
hf_quant_level: int | None = 8, # 4-bit / 8-bit loading for HF models
) -> BaseLLM:
"""
    Given the path to an LLM, determine the type, load it, and convert it into a LlamaIndex-compatible LLM.
NOTE: I chose to set some "sane" defaults, so it's probably not as flexible as some other dev would like.
"""
# if (model_path_extension == ".gguf"):
# ##### LLAMA.CPP
# return(_get_llamacpp_llm(model_path, model_seed, model_temperature, model_context_length, model_max_new_tokens))
# TODO(Jonathan Wang): Consider non-HF-Transformers backends
# vLLM support for AWQ/GPTQ models
# I guess reluctantly AutoAWQ and AutoGPTQ packages.
# Exllamav2 is kinda dead IMO.
# else:
#### No extension or weird fake extension suggests a folder, i.e., the base model from HF
    return _get_hf_llm(
        model_path=model_path,
        model_temperature=model_temperature,
        model_context_length=model_context_length,
        model_max_new_tokens=model_max_new_tokens,
        hf_quant_level=hf_quant_level,
    )
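# Example usage (sketch): `complete` comes from the LlamaIndex BaseLLM interface.
# llm = get_llm(hf_quant_level=8)
# print(llm.complete("Summarize the attached contract in one sentence.").text)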
# @st.cache_resource
# def get_llm() -> BaseLLM:
# from llama_index.llms.groq import Groq
# llm = Groq(
# model='llama-3.1-8b-instant', # old: 'llama3-8b-8192'
# api_key=os.environ.get('GROQ_API_KEY'),
# )
# return (llm)
class EosLogitProcessor(LogitsProcessor):
    """Special snowflake logits processor for the Salesforce XGen-MM (Phi-3) vision model: forces EOS right after its <|end|> token."""
def __init__(self, eos_token_id: int, end_token_id: int):
super().__init__()
self.eos_token_id = eos_token_id
self.end_token_id = end_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
if input_ids.size(1) > 1: # Expect at least 1 output token.
forced_eos = torch.full((scores.size(1),), -float("inf"), device=input_ids.device)
forced_eos[self.eos_token_id] = 0
# Force generation of EOS after the <|end|> token.
scores[input_ids[:, -1] == self.end_token_id] = forced_eos
return scores
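# Sketch of how this processor plugs into generation (see `complete` below); 32007 is the
# `<|end|>` token id for the Salesforce XGen-MM (Phi-3) checkpoint.
# tokens = model.generate(
#     **inputs,
#     logits_processor=[EosLogitProcessor(eos_token_id=tokenizer.eos_token_id, end_token_id=32007)],
# )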
# NOTE: These two protocols are needed to appease mypy
# https://github.com/run-llama/llama_index/blob/5238b04c183119b3035b84e2663db115e63dcfda/llama-index-core/llama_index/core/llms/llm.py#L89
@runtime_checkable
class MessagesImagesToPromptType(Protocol):
def __call__(self, messages: Sequence[ChatMessage], images: Sequence[ImageDocument], **kwargs: Any) -> str:
pass
MessagesImagesToPromptCallable = Annotated[
Optional[MessagesImagesToPromptType],
WithJsonSchema({"type": "string"}),
]
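# Sketch of a conforming callable (hypothetical; any function with this shape satisfies the protocol):
# def my_messages_images_to_prompt(messages: Sequence[ChatMessage], images: Sequence[ImageDocument], **kwargs: Any) -> str:
#     image_tags = "".join("<image>" for _ in images)  # assumes the model wants one <image> tag per image
#     return image_tags + generic_messages_to_prompt(messages)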
# https://huggingface.co/Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5/blob/main/batch_inference.ipynb
class HuggingFaceMultiModalLLM(MultiModalLLM):
    """A wrapper around HuggingFace vision LLMs, exposed through the LlamaIndex MultiModalLLM interface.

    Currently only supports one model: Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5
    """
model_name: str = Field(
description='The multi-modal huggingface LLM to use. Currently only using Phi3.',
default=DEFAULT_HF_MULTIMODAL_LLM
)
context_window: int = Field(
default=DEFAULT_HF_MULTIMODAL_CONTEXT_WINDOW,
description="The maximum number of tokens available for input.",
gt=0,
)
max_new_tokens: int = Field(
default=DEFAULT_HF_MULTIMODAL_MAX_NEW_TOKENS,
description="The maximum number of tokens to generate.",
gt=0,
)
system_prompt: str = Field(
default="",
description=(
"The system prompt, containing any extra instructions or context. "
"The model card on HuggingFace should specify if this is needed."
),
)
query_wrapper_prompt: PromptTemplate = Field(
default=PromptTemplate("{query_str}"),
description=(
"The query wrapper prompt, containing the query placeholder. "
"The model card on HuggingFace should specify if this is needed. "
"Should contain a `{query_str}` placeholder."
),
)
tokenizer_name: str = Field(
default=DEFAULT_HF_MULTIMODAL_LLM,
description=(
"The name of the tokenizer to use from HuggingFace. "
"Unused if `tokenizer` is passed in directly."
),
)
processor_name: str = Field(
default=DEFAULT_HF_MULTIMODAL_LLM,
description=(
"The name of the processor to use from HuggingFace. "
"Unused if `processor` is passed in directly."
),
)
device_map: str = Field(
default="auto", description="The device_map to use. Defaults to 'auto'."
)
stopping_ids: list[int] = Field(
default_factory=list,
description=(
"The stopping ids to use. "
"Generation stops when these token IDs are predicted."
),
)
tokenizer_outputs_to_remove: list = Field(
default_factory=list,
description=(
"The outputs to remove from the tokenizer. "
"Sometimes huggingface tokenizers return extra inputs that cause errors."
),
)
tokenizer_kwargs: dict = Field(
default_factory=dict, description="The kwargs to pass to the tokenizer."
)
processor_kwargs: dict = Field(
default_factory=dict, description="The kwargs to pass to the processor."
)
model_kwargs: dict = Field(
default_factory=dict,
description="The kwargs to pass to the model during initialization.",
)
generate_kwargs: dict = Field(
default_factory=dict,
description="The kwargs to pass to the model during generation.",
)
is_chat_model: bool = Field(
default=False,
description=(
"Whether the model can have multiple messages passed at once, like the OpenAI chat API."
# LLMMetadata.__fields__["is_chat_model"].field_info.description
# + " Be sure to verify that you either pass an appropriate tokenizer "
# "that can convert prompts to properly formatted chat messages or a "
# "`messages_to_prompt` that does so."
),
)
messages_images_to_prompt: MessagesImagesToPromptCallable = Field(
default=generic_messages_to_prompt,
description="A function that takes in a list of messages and images and returns a prompt string.",
)
_model: Any = PrivateAttr()
_tokenizer: Any = PrivateAttr()
# TODO(Jonathan Wang): We need to add a separate field for AutoProcessor as opposed to ImageProcessors.
_processor: Any = PrivateAttr()
_stopping_criteria: Any = PrivateAttr()
def __init__(
self,
context_window: int = DEFAULT_HF_MULTIMODAL_CONTEXT_WINDOW,
max_new_tokens: int = DEFAULT_HF_MULTIMODAL_MAX_NEW_TOKENS,
query_wrapper_prompt: Union[str, PromptTemplate] = "{query_str}",
tokenizer_name: str = DEFAULT_HF_MULTIMODAL_LLM,
processor_name: str = DEFAULT_HF_MULTIMODAL_LLM,
model_name: str = DEFAULT_HF_MULTIMODAL_LLM,
model: Any | None = None,
tokenizer: Any | None = None,
processor: Any | None = None,
device_map: str = "auto",
stopping_ids: list[int] | None = None,
tokenizer_kwargs: dict[str, Any] | None = None,
processor_kwargs: dict[str, Any] | None = None,
tokenizer_outputs_to_remove: list[str] | None = None,
model_kwargs: dict[str, Any] | None = None,
generate_kwargs: dict[str, Any] | None = None,
is_chat_model: bool = False,
callback_manager: CallbackManager | None = None,
system_prompt: str = "",
messages_images_to_prompt: Callable[[Sequence[ChatMessage], Sequence[ImageDocument]], str] | None = None,
# completion_to_prompt: Callable[[str], str] | None = None,
# pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
# output_parser: BaseOutputParser | None = None,
) -> None:
logger.info(f"CUDA Memory Pre-AutoModelForVision2Seq: {torch.cuda.mem_get_info()}")
        # The Salesforce model loads as an AutoModelForVision2Seq rather than the more common AutoModelForCausalLM.
model = model or AutoModelForVision2Seq.from_pretrained(
model_name,
device_map=device_map,
trust_remote_code=True,
**(model_kwargs or {})
)
logger.info(f"CUDA Memory Post-AutoModelForVision2Seq: {torch.cuda.mem_get_info()}")
# check context_window
config_dict = model.config.to_dict()
model_context_window = int(
config_dict.get("max_position_embeddings", context_window)
)
if model_context_window < context_window:
logger.warning(
f"Supplied context_window {context_window} is greater "
f"than the model's max input size {model_context_window}. "
"Disable this warning by setting a lower context_window."
)
context_window = model_context_window
processor_kwargs = processor_kwargs or {}
if "max_length" not in processor_kwargs:
processor_kwargs["max_length"] = context_window
# NOTE: Sometimes models (phi-3) will use AutoProcessor and include the tokenizer within it.
logger.info(f"CUDA Memory Pre-Processor: {torch.cuda.mem_get_info()}")
processor = processor or AutoImageProcessor.from_pretrained(
processor_name or model_name,
trust_remote_code=True,
**processor_kwargs
)
logger.info(f"CUDA Memory Post-Processor: {torch.cuda.mem_get_info()}")
tokenizer = tokenizer or AutoTokenizer.from_pretrained(
tokenizer_name or model_name,
trust_remote_code=True,
**(tokenizer_kwargs or {})
)
logger.info(f"CUDA Memory Post-Tokenizer: {torch.cuda.mem_get_info()}")
# Tokenizer-Model disagreement
if (hasattr(tokenizer, "name_or_path") and tokenizer.name_or_path != model_name): # type: ignore (checked for attribute)
            logger.warning(
                f"The model `{model_name}` and tokenizer `{getattr(tokenizer, 'name_or_path', None)}` "
                f"are different, please ensure that they are compatible."
            )
# Processor-Model disagreement
if (hasattr(processor, "name_or_path") and getattr(processor, "name_or_path", None) != model_name):
logger.warning(
f"The model `{model_name}` and processor `{getattr(processor, 'name_or_path', None)}` "
f"are different, please ensure that they are compatible."
)
# setup stopping criteria
stopping_ids_list = stopping_ids or []
class StopOnTokens(StoppingCriteria):
def __call__(
self,
input_ids: torch.LongTensor,
scores: torch.FloatTensor,
**kwargs: Any,
) -> bool:
return any(input_ids[0][-1] == stop_id for stop_id in stopping_ids_list)
stopping_criteria = StoppingCriteriaList([StopOnTokens()])
if isinstance(query_wrapper_prompt, str):
query_wrapper_prompt = PromptTemplate(query_wrapper_prompt)
messages_images_to_prompt = messages_images_to_prompt or self._processor_messages_to_prompt
# Initiate standard LLM
super().__init__(
callback_manager=callback_manager or CallbackManager([]),
)
logger.info(f"CUDA Memory Post-SuperInit: {torch.cuda.mem_get_info()}")
# Initiate remaining fields
self._model = model
self._tokenizer = tokenizer
self._processor = processor
logger.info(f"CUDA Memory Post-Init: {torch.cuda.mem_get_info()}")
self._stopping_criteria = stopping_criteria
        self.model_name = model_name
        self.context_window = context_window
        self.max_new_tokens = max_new_tokens
        self.system_prompt = system_prompt
        self.query_wrapper_prompt = query_wrapper_prompt
        self.tokenizer_name = tokenizer_name
        self.processor_name = processor_name
        self.device_map = device_map
        self.stopping_ids = stopping_ids or []
        self.tokenizer_outputs_to_remove = tokenizer_outputs_to_remove or []
        self.tokenizer_kwargs = tokenizer_kwargs or {}
        self.processor_kwargs = processor_kwargs or {}
        self.model_kwargs = model_kwargs or {}
        self.generate_kwargs = generate_kwargs or {}
        self.is_chat_model = is_chat_model
        self.messages_images_to_prompt = messages_images_to_prompt
# self.completion_to_prompt=completion_to_prompt,
# self.pydantic_program_mode=pydantic_program_mode,
# self.output_parser=output_parser,
@classmethod
def class_name(cls) -> str:
return "HuggingFace_MultiModal_LLM"
@property
def metadata(self) -> LLMMetadata:
"""LLM metadata."""
return LLMMetadata(
context_window=self.context_window,
num_output=self.max_new_tokens,
model_name=self.model_name,
is_chat_model=self.is_chat_model,
)
    def _processor_messages_to_prompt(self, messages: Sequence[ChatMessage], images: Sequence[ImageDocument]) -> str:
        # TODO(Jonathan Wang): Make this work generically. Currently we're building for `Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5`
        """Convert a list of messages (and their associated images) into a prompt for the multimodal LLM.

        Uses the tokenizer's chat template when available; falls back to the generic converter.
        NOTE: We assume for simplicity that the images are related, rather than the user bouncing between multiple topics, so we send them all at once.
        NOTE: For `Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5`, the `images` are not referenced here; no image placeholders are inserted.

        Args:
            messages (Sequence[ChatMessage]): The messages to convert, each carrying a role and content.
            images (Sequence[ImageDocument]): The images the user is passing to the MultiModalLLM.

        Returns:
            str: The prompt.
        """
if hasattr(self._tokenizer, "apply_chat_template"):
messages_dict = [
{"role": message.role.value, "content": message.content}
for message in messages
]
return self._tokenizer.apply_chat_template(
messages_dict, tokenize=False, add_generation_prompt=True
)
return generic_messages_to_prompt(messages)
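    # Sketch of the resulting prompt for a single user turn, assuming the tokenizer ships a
    # Phi-3-style chat template (exact tags depend on the checkpoint):
    # "<|system|>\n{system_prompt}<|end|>\n<|user|>\n{query_str}<|end|>\n<|assistant|>\n"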
@llm_completion_callback()
def complete(
self,
prompt: str,
image_documents: ImageNode | List[ImageNode] | ImageDocument | List[ImageDocument], # this also takes ImageDocument which inherits from ImageNode.
formatted: bool = False,
**kwargs: Any
    ) -> CompletionResponse:
        """Given a prompt and image node(s), run the multimodal LLM and return its completion."""
# Handle images input
# https://huggingface.co/Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5/blob/main/demo.ipynb
batch_image_list = []
batch_image_sizes = []
batch_prompt = []
# Fix image_documents input typing
if (not isinstance(image_documents, list)):
image_documents = [image_documents]
image_documents = [cast(ImageDocument, image) for image in image_documents] # we probably won't be using the Document features, so I think this is fine.
# Convert input images into PIL images for the model.
image_list = []
image_sizes = []
for image in image_documents:
            # NOTE: ImageDocument inherits from ImageNode. We'll go extract the image.
image_io = image.resolve_image()
image_pil = PILImage.open(image_io)
image_list.append(self._processor([image_pil], image_aspect_ratio='anyres')['pixel_values'].to(self._model.device))
image_sizes.append(image_pil.size)
batch_image_list.append(image_list)
batch_image_sizes.append(image_sizes)
batch_prompt.append(prompt) # only one question per image
# Get the prompt
if not formatted and self.query_wrapper_prompt:
prompt = self.query_wrapper_prompt.format(query_str=prompt)
prompt_sequence = []
if self.system_prompt:
prompt_sequence.append(ChatMessage(role=MessageRole.SYSTEM, content=self.system_prompt))
prompt_sequence.append(ChatMessage(role=MessageRole.USER, content=prompt))
prompt = self.messages_images_to_prompt(messages=prompt_sequence, images=image_documents)
# Get the model input
batch_inputs = {
"pixel_values": batch_image_list
}
language_inputs = self._tokenizer(
[prompt],
return_tensors="pt",
padding='longest', # probably not needed.
max_length=self._tokenizer.model_max_length,
truncation=True
).to(self._model.device)
# TODO: why does the example cookbook have this weird conversion to Cuda instead of .to(device)?
# language_inputs = {name: tensor.cuda() for name, tensor in language_inputs.items()}
batch_inputs.update(language_inputs)
gc.collect()
torch.cuda.empty_cache()
# remove keys from the tokenizer if needed, to avoid HF errors
# TODO: this probably is broken and wouldn't work.
for key in self.tokenizer_outputs_to_remove:
if key in batch_inputs:
batch_inputs.pop(key, None)
# Get output
tokens = self._model.generate(
**batch_inputs,
image_sizes=batch_image_sizes,
pad_token_id=self._tokenizer.pad_token_id,
eos_token_id=self._tokenizer.eos_token_id,
max_new_tokens=self.max_new_tokens,
stopping_criteria=self._stopping_criteria,
# NOTE: Special snowflake processor for Salesforce XGEN Phi3 Mini.
logits_processor=[EosLogitProcessor(eos_token_id=self._tokenizer.eos_token_id, end_token_id=32007)],
**self.generate_kwargs
)
gc.collect()
torch.cuda.empty_cache()
# completion_tokens = tokens[:, batch_inputs['input_ids'].shape[1]:]
completion = self._tokenizer.batch_decode(
tokens,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]
gc.collect()
torch.cuda.empty_cache()
output = CompletionResponse(text=completion, raw={'model_output': tokens})
# Clean stuff up
del batch_image_list, batch_image_sizes, batch_inputs, tokens, completion
gc.collect()
torch.cuda.empty_cache()
# Return the completion
return output
@llm_completion_callback()
def stream_complete(
self, prompt: str, formatted: bool = False, **kwargs: Any
) -> CompletionResponseGen:
raise NotImplementedError
@llm_chat_callback()
def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
raise NotImplementedError
@llm_chat_callback()
def stream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen:
raise NotImplementedError
@llm_completion_callback()
async def acomplete(
self,
prompt: str,
images: ImageNode | List[ImageNode], # this also takes ImageDocument which inherits from ImageNode.
formatted: bool = False,
**kwargs: Any
) -> CompletionResponse:
raise NotImplementedError
@llm_completion_callback()
async def astream_complete(
self, prompt: str, formatted: bool = False, **kwargs: Any
) -> CompletionResponseGen:
raise NotImplementedError
@llm_chat_callback()
async def achat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
raise NotImplementedError
@llm_chat_callback()
async def astream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen:
raise NotImplementedError
# @st.cache_resource()
# def get_multimodal_llm(**kwargs) -> MultiModalLLM:
# vision_llm = OpenAIMultiModal(
# model='gpt-4o-mini',
# temperature=0,
# max_new_tokens=512,
# image_detail='auto'
# )
# return (vision_llm)
@st.cache_resource
def get_multimodal_llm(
model_name: str = DEFAULT_HF_MULTIMODAL_LLM,
device_map: str = "cuda", # does not support 'auto'
processor_kwargs: dict[str, Any] | None = None,
model_kwargs: dict[str, Any] | None = None, # {'torch_dtype': torch.bfloat16}, # {'torch_dtype': torch.float8_e5m2}
generate_kwargs: dict[str, Any] | None = None, # from the example cookbook
hf_quant_level: int | None = 8,
) -> HuggingFaceMultiModalLLM:
# Get default generate kwargs
if model_kwargs is None:
model_kwargs = {}
if processor_kwargs is None:
processor_kwargs = {}
if generate_kwargs is None:
generate_kwargs = {
"temperature": sys.float_info.min,
"top_p": None,
"num_beams": 1
            # NOTE: we hack in EosLogitProcessor inside HuggingFaceMultiModalLLM.complete, because that's where we have access to tokenizer.eos_token_id
}
# Get Quantization with Quanto
quanto_config = None # NOTE: by default, no quantization.
if (hf_quant_level == 4):
# bnb_config = BitsAndBytesConfig(
# # load_in_8bit=True,
# load_in_4bit=True,
# # bnb_4bit_use_double_quant=True,
# bnb_4bit_quant_type="nf4",
# bnb_4bit_compute_dtype='bfloat16', # NOTE: Tesla T4 GPUs are too crappy for bfloat16
# # bnb_4bit_compute_dtype='float16'
# )
quanto_config = QuantoConfig(
weights="int4" # there's also 'int2' if you're crazy...
)
elif (hf_quant_level == 8):
# bnb_config = BitsAndBytesConfig(
# load_in_8bit=True
# )
quanto_config = QuantoConfig(
weights="int8"
)
if (quanto_config is not None):
model_kwargs["quantization_config"] = quanto_config
return HuggingFaceMultiModalLLM(
model_name=model_name,
device_map=device_map,
processor_kwargs=processor_kwargs,
model_kwargs=model_kwargs,
generate_kwargs=generate_kwargs,
max_new_tokens=1024 # from the example cookbook
)
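# Example usage (sketch): describe a page image with the multimodal LLM.
# The file path is hypothetical; `ImageDocument` resolves `image_path` to the raw image bytes.
# mm_llm = get_multimodal_llm()
# response = mm_llm.complete(
#     prompt="Describe the table shown in this page image.",
#     image_documents=[ImageDocument(image_path="page_1.png")],
# )
# print(response.text)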