#####################################################
### DOCUMENT PROCESSOR [MODELS]
#####################################################
# Jonathan Wang
# ABOUT:
# This project creates an app to chat with PDFs.
# This is the LANGUAGE MODELS
# that are used in the document reader.
#####################################################
## TODOS:
# <!> Add support for vLLM / AWQ / GPTQ models. (probably not going to be done due to lack of attention scores)
# Add KTransformers backend?
# https://github.com/kvcache-ai/ktransformers
# https://github.com/Tada-AI/pdf_parser
#####################################################
## IMPORTS:
from __future__ import annotations
import gc
import logging
import sys
from typing import (
Any,
Callable,
Dict,
List,
Optional,
Protocol,
Sequence,
Union,
cast,
runtime_checkable,
)
import streamlit as st
import torch
from llama_index.core.base.embeddings.base import BaseEmbedding
from llama_index.core.base.llms.base import BaseLLM
from llama_index.core.base.llms.generic_utils import (
messages_to_prompt as generic_messages_to_prompt,
)
from llama_index.core.base.llms.types import (
ChatMessage,
ChatResponse,
ChatResponseGen,
CompletionResponse,
CompletionResponseGen,
LLMMetadata,
MessageRole,
)
from llama_index.core.bridge.pydantic import Field, PrivateAttr, WithJsonSchema
from llama_index.core.callbacks import CallbackManager
from llama_index.core.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS
from llama_index.core.llms.callbacks import (
llm_chat_callback,
llm_completion_callback,
)
from llama_index.core.multi_modal_llms import MultiModalLLM
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.prompts.base import PromptTemplate
from llama_index.core.schema import ImageDocument, ImageNode
from llama_index.core.types import BaseOutputParser, PydanticProgramMode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from PIL import Image as PILImage
from transformers import (
AutoImageProcessor,
AutoModelForVision2Seq,
AutoTokenizer,
LogitsProcessor,
QuantoConfig,
StoppingCriteria,
StoppingCriteriaList,
)
from typing_extensions import Annotated
# from wtpsplit import SaT # Sentence segmentation model. Dropping this. Requires adapters=0.2.1->Transformers=4.39.3 | Phi3 Vision requires Transformers 4.40.2
## NOTE: Proposal for LAZY LOADING packages for running LLMs:
# Currently not done because the emphasis is on local inference with the ability to get Attention Scores, which is not yet supported by non-HF-Transformers backends.
## LLamacpp:
# from llama_index.llms.llama_cpp import LlamaCPP
# from llama_index.llms.llama_cpp.llama_utils import (
# messages_to_prompt,
# completion_to_prompt
# )
## HF Transformers LLM:
# from transformers import AutoTokenizer, BitsAndBytesConfig
# from llama_index.llms.huggingface import HuggingFaceLLM
## GROQ
# from llama_index.llms.groq import Groq
#####################################################
### SETTINGS:
DEFAULT_HF_MULTIMODAL_LLM = "Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5"
DEFAULT_HF_MULTIMODAL_CONTEXT_WINDOW = 1024
DEFAULT_HF_MULTIMODAL_MAX_NEW_TOKENS = 1024
#####################################################
### CODE:
logger = logging.getLogger(__name__)
@st.cache_resource
def get_embedder(
model_path: str = "mixedbread-ai/mxbai-embed-large-v1",
device: str = "cuda", # 'cpu' is unbearably slow
) -> BaseEmbedding:
"""Given the path to an embedding model, load it."""
# NOTE: okay we definitely could have not made this wrapper, but shrug
return HuggingFaceEmbedding(
model_path,
device=device
)
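# Example usage (sketch): embed a query string with the loaded model.
# `get_text_embedding` is part of the LlamaIndex BaseEmbedding interface.
# embedder = get_embedder()
# vector = embedder.get_text_embedding("What does this PDF say about termination clauses?")
# print(len(vector))  # embedding dimension (1024 for mxbai-embed-large-v1)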
@st.cache_resource
def get_reranker(
model_path: str = "mixedbread-ai/mxbai-rerank-large-v1",
top_n: int = 3,
device: str = "cpu", # 'cuda' if we were rich
) -> SentenceTransformerRerank: # technically this is a BaseNodePostprocessor, but that seems too abstract.
"""Given the path to a reranking model, load it."""
# NOTE: okay we definitely could have not made this wrapper, but shrug
return SentenceTransformerRerank(
model=model_path,
top_n=top_n,
device=device
)
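# Example usage (sketch): rerank retrieved nodes against the user query before synthesis.
# `postprocess_nodes` comes from the BaseNodePostprocessor interface; `retrieved_nodes`
# is a hypothetical List[NodeWithScore] produced by a retriever.
# reranker = get_reranker(top_n=3)
# top_nodes = reranker.postprocess_nodes(retrieved_nodes, query_str="What is the invoice total?")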
## LLM Options Below
# def _get_llamacpp_llm(
# model_path: str,
# model_seed: int = 31415926,
# model_temperature: float = 1e-64, # ideally 0, but HF-type doesn't allow that. # a good dev might use sys.float_info()['min']
# model_context_length: Optional[int] = 8192,
# model_max_new_tokens: Optional[int] = 1024,
# ) -> BaseLLM:
# """Load a LlamaCPP model using GPU and other sane defaults."""
# # Lazy Loading
# from llama_index.llms.llama_cpp import LlamaCPP
# from llama_index.llms.llama_cpp.llama_utils import (
# messages_to_prompt,
# completion_to_prompt
# )
# # Arguments to Pass
# llm = LlamaCPP(
# model_path=model_path,
# temperature=model_temperature,
# max_new_tokens=model_max_new_tokens,
# context_window=model_context_length,
# # kwargs to pass to __call__()
# generate_kwargs={'seed': model_seed}, # {'temperature': TEMPERATURE, 'top_p':0.7, 'min_p':0.1, 'seed': MODEL_SEED},
# # kwargs to pass to __init__()
# # set to at least 1 to use GPU
# model_kwargs={'n_gpu_layers': -1, 'n_threads': os.cpu_count()-1}, #, 'rope_freq_scale': 0.83, 'rope_freq_base': 20000},
# # transform inputs into model format
# messages_to_prompt=messages_to_prompt,
# completion_to_prompt=completion_to_prompt,
# verbose=True,
# )
# return (llm)
@st.cache_resource
def _get_hf_llm(
model_path: str,
    model_temperature: float = sys.float_info.min,  # ideally 0, but the HF implementation rejects a temperature of exactly 0, so use the smallest positive float instead.
model_context_length: int | None = 16384,
model_max_new_tokens: int | None = 2048,
hf_quant_level: int | None = 8,
) -> BaseLLM:
"""Load a Huggingface-Transformers based model using sane defaults."""
# Fix temperature if needed; HF implementation complains about it being zero
model_temperature = max(sys.float_info.min, model_temperature)
# Get Quantization with BitsandBytes
quanto_config = None # NOTE: by default, no quantization.
if (hf_quant_level == 4):
# bnb_config = BitsAndBytesConfig(
# # load_in_8bit=True,
# load_in_4bit=True,
# # bnb_4bit_use_double_quant=True,
# bnb_4bit_quant_type="nf4",
# bnb_4bit_compute_dtype='bfloat16', # NOTE: Tesla T4 GPUs are too crappy for bfloat16
# # bnb_4bit_compute_dtype='float16'
# )
quanto_config = QuantoConfig(
weights="int4" # there's also 'int2' if you're crazy...
)
elif (hf_quant_level == 8):
# bnb_config = BitsAndBytesConfig(
# load_in_8bit=True
# )
quanto_config = QuantoConfig(
weights="int8"
)
# Get Stopping Tokens for Llama3 based models, because they're /special/ and added a new one.
tokenizer = AutoTokenizer.from_pretrained(
model_path
)
stopping_ids = [
tokenizer.eos_token_id,
tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]
return HuggingFaceLLM(
model_name=model_path,
tokenizer_name=model_path,
stopping_ids=stopping_ids,
max_new_tokens=model_max_new_tokens or DEFAULT_NUM_OUTPUTS,
context_window=model_context_length or DEFAULT_CONTEXT_WINDOW,
tokenizer_kwargs={"trust_remote_code": True},
model_kwargs={"trust_remote_code": True, "quantization_config": quanto_config},
        generate_kwargs={
            "do_sample": model_temperature > sys.float_info.min,  # greedy decode when the temperature is effectively zero
            "temperature": model_temperature,
        },
is_chat_model=True,
)
@st.cache_resource
def get_llm(
model_path: str = "meta-llama/Meta-Llama-3.1-8B-Instruct",
    model_temperature: float = 0,  # 0 requests greedy decoding; _get_hf_llm clamps it to the smallest positive float since the HF implementation rejects exactly 0.
model_context_length: int | None = 8192,
model_max_new_tokens: int | None = 1024,
hf_quant_level: int | None = 8, # 4-bit / 8-bit loading for HF models
) -> BaseLLM:
"""
    Given the path to an LLM, determine the type, load it, and convert it into a LlamaIndex-compatible LLM.
NOTE: I chose to set some "sane" defaults, so it's probably not as flexible as some other dev would like.
"""
# if (model_path_extension == ".gguf"):
# ##### LLAMA.CPP
# return(_get_llamacpp_llm(model_path, model_seed, model_temperature, model_context_length, model_max_new_tokens))
# TODO(Jonathan Wang): Consider non-HF-Transformers backends
# vLLM support for AWQ/GPTQ models
# I guess reluctantly AutoAWQ and AutoGPTQ packages.
# Exllamav2 is kinda dead IMO.
# else:
#### No extension or weird fake extension suggests a folder, i.e., the base model from HF
    return _get_hf_llm(
        model_path=model_path,
        model_temperature=model_temperature,
        model_context_length=model_context_length,
        model_max_new_tokens=model_max_new_tokens,
        hf_quant_level=hf_quant_level,
    )
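# Example usage (sketch): `complete` comes from the LlamaIndex BaseLLM interface.
# llm = get_llm(hf_quant_level=8)
# print(llm.complete("Summarize the attached contract in one sentence.").text)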
# @st.cache_resource
# def get_llm() -> BaseLLM:
# from llama_index.llms.groq import Groq
# llm = Groq(
# model='llama-3.1-8b-instant', # old: 'llama3-8b-8192'
# api_key=os.environ.get('GROQ_API_KEY'),
# )
# return (llm)
class EosLogitProcessor(LogitsProcessor):
    """Special snowflake logits processor for the Salesforce XGen-MM (Phi-3) vision model: forces EOS right after its <|end|> token."""
def __init__(self, eos_token_id: int, end_token_id: int):
super().__init__()
self.eos_token_id = eos_token_id
self.end_token_id = end_token_id
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
if input_ids.size(1) > 1: # Expect at least 1 output token.
forced_eos = torch.full((scores.size(1),), -float("inf"), device=input_ids.device)
forced_eos[self.eos_token_id] = 0
# Force generation of EOS after the <|end|> token.
scores[input_ids[:, -1] == self.end_token_id] = forced_eos
return scores
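# Sketch of how this processor plugs into generation (see `complete` below); 32007 is the
# `<|end|>` token id for the Salesforce XGen-MM (Phi-3) checkpoint.
# tokens = model.generate(
#     **inputs,
#     logits_processor=[EosLogitProcessor(eos_token_id=tokenizer.eos_token_id, end_token_id=32007)],
# )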
# NOTE: These two protocols are needed to appease mypy
# https://github.com/run-llama/llama_index/blob/5238b04c183119b3035b84e2663db115e63dcfda/llama-index-core/llama_index/core/llms/llm.py#L89
@runtime_checkable
class MessagesImagesToPromptType(Protocol):
def __call__(self, messages: Sequence[ChatMessage], images: Sequence[ImageDocument], **kwargs: Any) -> str:
pass
MessagesImagesToPromptCallable = Annotated[
Optional[MessagesImagesToPromptType],
WithJsonSchema({"type": "string"}),
]
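# Sketch of a conforming callable (hypothetical; any function with this shape satisfies the protocol):
# def my_messages_images_to_prompt(messages: Sequence[ChatMessage], images: Sequence[ImageDocument], **kwargs: Any) -> str:
#     image_tags = "".join("<image>" for _ in images)  # assumes the model wants one <image> tag per image
#     return image_tags + generic_messages_to_prompt(messages)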
# https://huggingface.co/Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5/blob/main/batch_inference.ipynb
class HuggingFaceMultiModalLLM(MultiModalLLM):
    """A wrapper around HuggingFace vision LLMs, exposed through the LlamaIndex MultiModalLLM interface.

    Currently only supports one model: Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5
    """
model_name: str = Field(
description='The multi-modal huggingface LLM to use. Currently only using Phi3.',
default=DEFAULT_HF_MULTIMODAL_LLM
)
context_window: int = Field(
default=DEFAULT_HF_MULTIMODAL_CONTEXT_WINDOW,
description="The maximum number of tokens available for input.",
gt=0,
)
max_new_tokens: int = Field(
default=DEFAULT_HF_MULTIMODAL_MAX_NEW_TOKENS,
description="The maximum number of tokens to generate.",
gt=0,
)
system_prompt: str = Field(
default="",
description=(
"The system prompt, containing any extra instructions or context. "
"The model card on HuggingFace should specify if this is needed."
),
)
query_wrapper_prompt: PromptTemplate = Field(
default=PromptTemplate("{query_str}"),
description=(
"The query wrapper prompt, containing the query placeholder. "
"The model card on HuggingFace should specify if this is needed. "
"Should contain a `{query_str}` placeholder."
),
)
tokenizer_name: str = Field(
default=DEFAULT_HF_MULTIMODAL_LLM,
description=(
"The name of the tokenizer to use from HuggingFace. "
"Unused if `tokenizer` is passed in directly."
),
)
processor_name: str = Field(
default=DEFAULT_HF_MULTIMODAL_LLM,
description=(
"The name of the processor to use from HuggingFace. "
"Unused if `processor` is passed in directly."
),
)
device_map: str = Field(
default="auto", description="The device_map to use. Defaults to 'auto'."
)
stopping_ids: list[int] = Field(
default_factory=list,
description=(
"The stopping ids to use. "
"Generation stops when these token IDs are predicted."
),
)
tokenizer_outputs_to_remove: list = Field(
default_factory=list,
description=(
"The outputs to remove from the tokenizer. "
"Sometimes huggingface tokenizers return extra inputs that cause errors."
),
)
tokenizer_kwargs: dict = Field(
default_factory=dict, description="The kwargs to pass to the tokenizer."
)
processor_kwargs: dict = Field(
default_factory=dict, description="The kwargs to pass to the processor."
)
model_kwargs: dict = Field(
default_factory=dict,
description="The kwargs to pass to the model during initialization.",
)
generate_kwargs: dict = Field(
default_factory=dict,
description="The kwargs to pass to the model during generation.",
)
is_chat_model: bool = Field(
default=False,
description=(
"Whether the model can have multiple messages passed at once, like the OpenAI chat API."
# LLMMetadata.__fields__["is_chat_model"].field_info.description
# + " Be sure to verify that you either pass an appropriate tokenizer "
# "that can convert prompts to properly formatted chat messages or a "
# "`messages_to_prompt` that does so."
),
)
messages_images_to_prompt: MessagesImagesToPromptCallable = Field(
default=generic_messages_to_prompt,
description="A function that takes in a list of messages and images and returns a prompt string.",
)
_model: Any = PrivateAttr()
_tokenizer: Any = PrivateAttr()
# TODO(Jonathan Wang): We need to add a separate field for AutoProcessor as opposed to ImageProcessors.
_processor: Any = PrivateAttr()
_stopping_criteria: Any = PrivateAttr()
def __init__(
self,
context_window: int = DEFAULT_HF_MULTIMODAL_CONTEXT_WINDOW,
max_new_tokens: int = DEFAULT_HF_MULTIMODAL_MAX_NEW_TOKENS,
query_wrapper_prompt: Union[str, PromptTemplate] = "{query_str}",
tokenizer_name: str = DEFAULT_HF_MULTIMODAL_LLM,
processor_name: str = DEFAULT_HF_MULTIMODAL_LLM,
model_name: str = DEFAULT_HF_MULTIMODAL_LLM,
model: Any | None = None,
tokenizer: Any | None = None,
processor: Any | None = None,
device_map: str = "auto",
stopping_ids: list[int] | None = None,
tokenizer_kwargs: dict[str, Any] | None = None,
processor_kwargs: dict[str, Any] | None = None,
tokenizer_outputs_to_remove: list[str] | None = None,
model_kwargs: dict[str, Any] | None = None,
generate_kwargs: dict[str, Any] | None = None,
is_chat_model: bool = False,
callback_manager: CallbackManager | None = None,
system_prompt: str = "",
messages_images_to_prompt: Callable[[Sequence[ChatMessage], Sequence[ImageDocument]], str] | None = None,
# completion_to_prompt: Callable[[str], str] | None = None,
# pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
# output_parser: BaseOutputParser | None = None,
) -> None:
logger.info(f"CUDA Memory Pre-AutoModelForVision2Seq: {torch.cuda.mem_get_info()}")
        # The Salesforce model loads as an AutoModelForVision2Seq rather than the more common AutoModelForCausalLM.
model = model or AutoModelForVision2Seq.from_pretrained(
model_name,
device_map=device_map,
trust_remote_code=True,
**(model_kwargs or {})
)
logger.info(f"CUDA Memory Post-AutoModelForVision2Seq: {torch.cuda.mem_get_info()}")
# check context_window
config_dict = model.config.to_dict()
model_context_window = int(
config_dict.get("max_position_embeddings", context_window)
)
if model_context_window < context_window:
logger.warning(
f"Supplied context_window {context_window} is greater "
f"than the model's max input size {model_context_window}. "
"Disable this warning by setting a lower context_window."
)
context_window = model_context_window
processor_kwargs = processor_kwargs or {}
if "max_length" not in processor_kwargs:
processor_kwargs["max_length"] = context_window
# NOTE: Sometimes models (phi-3) will use AutoProcessor and include the tokenizer within it.
logger.info(f"CUDA Memory Pre-Processor: {torch.cuda.mem_get_info()}")
processor = processor or AutoImageProcessor.from_pretrained(
processor_name or model_name,
trust_remote_code=True,
**processor_kwargs
)
logger.info(f"CUDA Memory Post-Processor: {torch.cuda.mem_get_info()}")
tokenizer = tokenizer or AutoTokenizer.from_pretrained(
tokenizer_name or model_name,
trust_remote_code=True,
**(tokenizer_kwargs or {})
)
logger.info(f"CUDA Memory Post-Tokenizer: {torch.cuda.mem_get_info()}")
# Tokenizer-Model disagreement
if (hasattr(tokenizer, "name_or_path") and tokenizer.name_or_path != model_name): # type: ignore (checked for attribute)
            logger.warning(
                f"The model `{model_name}` and tokenizer `{getattr(tokenizer, 'name_or_path', None)}` "
                f"are different, please ensure that they are compatible."
            )
# Processor-Model disagreement
if (hasattr(processor, "name_or_path") and getattr(processor, "name_or_path", None) != model_name):
logger.warning(
f"The model `{model_name}` and processor `{getattr(processor, 'name_or_path', None)}` "
f"are different, please ensure that they are compatible."
)
# setup stopping criteria
stopping_ids_list = stopping_ids or []
class StopOnTokens(StoppingCriteria):
def __call__(
self,
input_ids: torch.LongTensor,
scores: torch.FloatTensor,
**kwargs: Any,
) -> bool:
return any(input_ids[0][-1] == stop_id for stop_id in stopping_ids_list)
stopping_criteria = StoppingCriteriaList([StopOnTokens()])
if isinstance(query_wrapper_prompt, str):
query_wrapper_prompt = PromptTemplate(query_wrapper_prompt)
messages_images_to_prompt = messages_images_to_prompt or self._processor_messages_to_prompt
# Initiate standard LLM
super().__init__(
callback_manager=callback_manager or CallbackManager([]),
)
logger.info(f"CUDA Memory Post-SuperInit: {torch.cuda.mem_get_info()}")
# Initiate remaining fields
self._model = model
self._tokenizer = tokenizer
self._processor = processor
logger.info(f"CUDA Memory Post-Init: {torch.cuda.mem_get_info()}")
self._stopping_criteria = stopping_criteria
        self.model_name = model_name
        self.context_window = context_window
        self.max_new_tokens = max_new_tokens
        self.system_prompt = system_prompt
        self.query_wrapper_prompt = query_wrapper_prompt
        self.tokenizer_name = tokenizer_name
        self.processor_name = processor_name
        self.device_map = device_map
        self.stopping_ids = stopping_ids or []
        self.tokenizer_outputs_to_remove = tokenizer_outputs_to_remove or []
        self.tokenizer_kwargs = tokenizer_kwargs or {}
        self.processor_kwargs = processor_kwargs or {}
        self.model_kwargs = model_kwargs or {}
        self.generate_kwargs = generate_kwargs or {}
        self.is_chat_model = is_chat_model
        self.messages_images_to_prompt = messages_images_to_prompt
# self.completion_to_prompt=completion_to_prompt,
# self.pydantic_program_mode=pydantic_program_mode,
# self.output_parser=output_parser,
@classmethod
def class_name(cls) -> str:
return "HuggingFace_MultiModal_LLM"
@property
def metadata(self) -> LLMMetadata:
"""LLM metadata."""
return LLMMetadata(
context_window=self.context_window,
num_output=self.max_new_tokens,
model_name=self.model_name,
is_chat_model=self.is_chat_model,
)
    def _processor_messages_to_prompt(self, messages: Sequence[ChatMessage], images: Sequence[ImageDocument]) -> str:
        # TODO(Jonathan Wang): Make this work generically. Currently we're building for `Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5`
        """Convert a list of messages (and their associated images) into a prompt for the multimodal LLM.

        Uses the tokenizer's chat template when available; falls back to the generic converter.
        NOTE: We assume for simplicity that the images are related, rather than the user bouncing between multiple topics, so we send them all at once.
        NOTE: For `Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5`, the `images` are not referenced here; no image placeholders are inserted.

        Args:
            messages (Sequence[ChatMessage]): The messages to convert, each carrying a role and content.
            images (Sequence[ImageDocument]): The images the user is passing to the MultiModalLLM.

        Returns:
            str: The prompt.
        """
if hasattr(self._tokenizer, "apply_chat_template"):
messages_dict = [
{"role": message.role.value, "content": message.content}
for message in messages
]
return self._tokenizer.apply_chat_template(
messages_dict, tokenize=False, add_generation_prompt=True
)
return generic_messages_to_prompt(messages)
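    # Sketch of the resulting prompt for a single user turn, assuming the tokenizer ships a
    # Phi-3-style chat template (exact tags depend on the checkpoint):
    # "<|system|>\n{system_prompt}<|end|>\n<|user|>\n{query_str}<|end|>\n<|assistant|>\n"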
@llm_completion_callback()
def complete(
self,
prompt: str,
image_documents: ImageNode | List[ImageNode] | ImageDocument | List[ImageDocument], # this also takes ImageDocument which inherits from ImageNode.
formatted: bool = False,
**kwargs: Any
    ) -> CompletionResponse:
        """Given a prompt and image node(s), run the multimodal LLM and return its completion."""
# Handle images input
# https://huggingface.co/Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5/blob/main/demo.ipynb
batch_image_list = []
batch_image_sizes = []
batch_prompt = []
# Fix image_documents input typing
if (not isinstance(image_documents, list)):
image_documents = [image_documents]
image_documents = [cast(ImageDocument, image) for image in image_documents] # we probably won't be using the Document features, so I think this is fine.
# Convert input images into PIL images for the model.
image_list = []
image_sizes = []
for image in image_documents:
            # NOTE: ImageDocument inherits from ImageNode. We'll go extract the image.
image_io = image.resolve_image()
image_pil = PILImage.open(image_io)
image_list.append(self._processor([image_pil], image_aspect_ratio='anyres')['pixel_values'].to(self._model.device))
image_sizes.append(image_pil.size)
batch_image_list.append(image_list)
batch_image_sizes.append(image_sizes)
batch_prompt.append(prompt) # only one question per image
# Get the prompt
if not formatted and self.query_wrapper_prompt:
prompt = self.query_wrapper_prompt.format(query_str=prompt)
prompt_sequence = []
if self.system_prompt:
prompt_sequence.append(ChatMessage(role=MessageRole.SYSTEM, content=self.system_prompt))
prompt_sequence.append(ChatMessage(role=MessageRole.USER, content=prompt))
prompt = self.messages_images_to_prompt(messages=prompt_sequence, images=image_documents)
# Get the model input
batch_inputs = {
"pixel_values": batch_image_list
}
language_inputs = self._tokenizer(
[prompt],
return_tensors="pt",
padding='longest', # probably not needed.
max_length=self._tokenizer.model_max_length,
truncation=True
).to(self._model.device)
# TODO: why does the example cookbook have this weird conversion to Cuda instead of .to(device)?
# language_inputs = {name: tensor.cuda() for name, tensor in language_inputs.items()}
batch_inputs.update(language_inputs)
gc.collect()
torch.cuda.empty_cache()
# remove keys from the tokenizer if needed, to avoid HF errors
# TODO: this probably is broken and wouldn't work.
for key in self.tokenizer_outputs_to_remove:
if key in batch_inputs:
batch_inputs.pop(key, None)
# Get output
tokens = self._model.generate(
**batch_inputs,
image_sizes=batch_image_sizes,
pad_token_id=self._tokenizer.pad_token_id,
eos_token_id=self._tokenizer.eos_token_id,
max_new_tokens=self.max_new_tokens,
stopping_criteria=self._stopping_criteria,
# NOTE: Special snowflake processor for Salesforce XGEN Phi3 Mini.
logits_processor=[EosLogitProcessor(eos_token_id=self._tokenizer.eos_token_id, end_token_id=32007)],
**self.generate_kwargs
)
gc.collect()
torch.cuda.empty_cache()
# completion_tokens = tokens[:, batch_inputs['input_ids'].shape[1]:]
completion = self._tokenizer.batch_decode(
tokens,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]
gc.collect()
torch.cuda.empty_cache()
output = CompletionResponse(text=completion, raw={'model_output': tokens})
# Clean stuff up
del batch_image_list, batch_image_sizes, batch_inputs, tokens, completion
gc.collect()
torch.cuda.empty_cache()
# Return the completion
return output
@llm_completion_callback()
def stream_complete(
self, prompt: str, formatted: bool = False, **kwargs: Any
) -> CompletionResponseGen:
raise NotImplementedError
@llm_chat_callback()
def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
raise NotImplementedError
@llm_chat_callback()
def stream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen:
raise NotImplementedError
@llm_completion_callback()
async def acomplete(
self,
prompt: str,
images: ImageNode | List[ImageNode], # this also takes ImageDocument which inherits from ImageNode.
formatted: bool = False,
**kwargs: Any
) -> CompletionResponse:
raise NotImplementedError
@llm_completion_callback()
async def astream_complete(
self, prompt: str, formatted: bool = False, **kwargs: Any
) -> CompletionResponseGen:
raise NotImplementedError
@llm_chat_callback()
async def achat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
raise NotImplementedError
@llm_chat_callback()
async def astream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen:
raise NotImplementedError
# @st.cache_resource()
# def get_multimodal_llm(**kwargs) -> MultiModalLLM:
# vision_llm = OpenAIMultiModal(
# model='gpt-4o-mini',
# temperature=0,
# max_new_tokens=512,
# image_detail='auto'
# )
# return (vision_llm)
@st.cache_resource
def get_multimodal_llm(
model_name: str = DEFAULT_HF_MULTIMODAL_LLM,
device_map: str = "cuda", # does not support 'auto'
processor_kwargs: dict[str, Any] | None = None,
model_kwargs: dict[str, Any] | None = None, # {'torch_dtype': torch.bfloat16}, # {'torch_dtype': torch.float8_e5m2}
generate_kwargs: dict[str, Any] | None = None, # from the example cookbook
hf_quant_level: int | None = 8,
) -> HuggingFaceMultiModalLLM:
# Get default generate kwargs
if model_kwargs is None:
model_kwargs = {}
if processor_kwargs is None:
processor_kwargs = {}
if generate_kwargs is None:
generate_kwargs = {
"temperature": sys.float_info.min,
"top_p": None,
"num_beams": 1
            # NOTE: we hack in EosLogitProcessor inside HuggingFaceMultiModalLLM.complete, because that's where we have access to tokenizer.eos_token_id
}
# Get Quantization with Quanto
quanto_config = None # NOTE: by default, no quantization.
if (hf_quant_level == 4):
# bnb_config = BitsAndBytesConfig(
# # load_in_8bit=True,
# load_in_4bit=True,
# # bnb_4bit_use_double_quant=True,
# bnb_4bit_quant_type="nf4",
# bnb_4bit_compute_dtype='bfloat16', # NOTE: Tesla T4 GPUs are too crappy for bfloat16
# # bnb_4bit_compute_dtype='float16'
# )
quanto_config = QuantoConfig(
weights="int4" # there's also 'int2' if you're crazy...
)
elif (hf_quant_level == 8):
# bnb_config = BitsAndBytesConfig(
# load_in_8bit=True
# )
quanto_config = QuantoConfig(
weights="int8"
)
if (quanto_config is not None):
model_kwargs["quantization_config"] = quanto_config
return HuggingFaceMultiModalLLM(
model_name=model_name,
device_map=device_map,
processor_kwargs=processor_kwargs,
model_kwargs=model_kwargs,
generate_kwargs=generate_kwargs,
max_new_tokens=1024 # from the example cookbook
)
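# Example usage (sketch): describe a page image with the multimodal LLM.
# The file path is hypothetical; `ImageDocument` resolves `image_path` to the raw image bytes.
# mm_llm = get_multimodal_llm()
# response = mm_llm.complete(
#     prompt="Describe the table shown in this page image.",
#     image_documents=[ImageDocument(image_path="page_1.png")],
# )
# print(response.text)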