import peft
import torch
import torch.nn as nn
import whisperx
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    CLIPVisionModel,
)


class Projections(nn.Module):
    """Projects CLIP vision embeddings into the phi-2 embedding space."""

    def __init__(
        self,
        clip_embed,
        phi_embed,
        num_projection_layers=6,
    ):
        super().__init__()
        self.norm = nn.LayerNorm(phi_embed)
        self.output = nn.Linear(clip_embed, phi_embed)
        # Residual MLP blocks applied after the initial linear projection.
        self.projection_layers = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Linear(phi_embed, phi_embed),
                    nn.GELU(),
                    nn.Linear(phi_embed, phi_embed),
                )
                for _ in range(num_projection_layers)
            ]
        )
    def forward(self, x):
        x = self.output(x)
        x = self.norm(x)
        for layer in self.projection_layers:
            residual = x
            x = layer(x) + residual
        return x
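
# Example (illustrative): with the sizes used below (CLIP ViT-B/32 hidden size
# 768, phi-2 hidden size 2560), the head maps a batch of CLIP patch embeddings
# of shape (batch, num_patches, 768) to (batch, num_patches, 2560):
#
#     proj = Projections(clip_embed=768, phi_embed=2560)
#     clip_feats = torch.randn(1, 49, 768)
#     image_embeds = proj(clip_feats)  # shape: (1, 49, 2560)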

def load_projection_model(path, clip_embed, phi_embed):
    """Loads a Projections model instance from a checkpoint and returns it with weights loaded.

    Args:
        path (str): Path to the checkpoint file.
        clip_embed (int): Input (CLIP) embedding dimension.
        phi_embed (int): Output (phi-2) embedding dimension.

    Returns:
        torch.nn.Module: The loaded Projections model instance.
    """
    # The checkpoint's state_dict stores this head's weights under a
    # 'projection.' prefix (e.g. 'projection.output.weight'); strip it so the
    # keys match the standalone Projections module.
    state_dict = torch.load(path, map_location="cpu")["state_dict"]
    new_state_dict = {k.replace("projection.", ""): v for k, v in state_dict.items()}
    model = Projections(clip_embed, phi_embed)
    model.load_state_dict(new_state_dict)
    return model


class Config:
    EOS_TOKEN_ID = 50256  # phi-2's <|endoftext|> token
    QUESTION_ANSWER_SEPARATOR_ID = 50295  # Special token ID for question-answer separation
    IMAGE_SEPARATOR_TOKENS = [685, 36259, 14041, 60, 220]  # Token IDs marking the image segment of the prompt

    phi_model_name = "microsoft/phi-2"
    model_name = "openai/clip-vit-base-patch32"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    processor = AutoProcessor.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
    # 768 = CLIP ViT-B/32 hidden size, 2560 = phi-2 hidden size.
    projection = load_projection_model("models/MModalGPT-FINETUNE-step=29000-loss=3.45.ckpt", 768, 2560)
    clip_model = CLIPVisionModel.from_pretrained(model_name)
    # float16 compute requires a GPU backend; fall back to int8 on CPU.
    audio_model = whisperx.load_model(
        "small",
        device.type,
        compute_type="float16" if device.type == "cuda" else "int8",
    )
    text_model = AutoModelForCausalLM.from_pretrained(
        phi_model_name,
        torch_dtype=torch.float16,
        # device_map="cuda",
        low_cpu_mem_usage=True,
        return_dict=True,
        trust_remote_code=True,
    )
    # Attach the fine-tuned adapter weights (saved at training step 29000) to phi-2.
    peft_model = peft.PeftModel.from_pretrained(text_model, "models/29000")
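

# Illustrative sketch (an assumption, not code from the original Space): one way
# the components above can be combined. The prompt layout, the use of
# inputs_embeds with generate(), and the function names here are assumptions;
# the Space's exact inference logic may differ.
def transcribe_audio(audio_path):
    # whisperx returns a dict of timestamped segments; join them into one string.
    result = Config.audio_model.transcribe(audio_path)
    return " ".join(seg["text"].strip() for seg in result["segments"])


def answer_image_question(image, question, max_new_tokens=64):
    pixel_values = Config.processor(images=image, return_tensors="pt").pixel_values
    with torch.no_grad():
        # CLIP patch embeddings, dropping the CLS token at position 0.
        clip_out = Config.clip_model(pixel_values).last_hidden_state[:, 1:, :]
        # Project into phi-2's embedding space and match the fp16 text model.
        image_embeds = Config.projection(clip_out).to(torch.float16)
        # Embed the question with phi-2's own token embedding table.
        q_ids = Config.tokenizer(question, return_tensors="pt").input_ids
        q_embeds = Config.peft_model.get_input_embeddings()(q_ids)
        # [image embeddings | question embeddings] -> generate an answer.
        inputs_embeds = torch.cat([image_embeds, q_embeds], dim=1)
        out_ids = Config.peft_model.generate(
            inputs_embeds=inputs_embeds,
            max_new_tokens=max_new_tokens,
            eos_token_id=Config.EOS_TOKEN_ID,
        )
    return Config.tokenizer.decode(out_ids[0], skip_special_tokens=True)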