In [1]:
import argparse
import os
import shutil
import random
from PIL import Image

import numpy as np
import torch
import torch.backends.cudnn as cudnn
from transformers import StoppingCriteria, StoppingCriteriaList

import lavis.tasks as tasks
from lavis.common.config import Config
from lavis.common.dist_utils import get_rank, init_distributed_mode
from lavis.common.logger import setup_logger
from lavis.common.optims import (
 LinearWarmupCosineLRScheduler,
 LinearWarmupStepLRScheduler,
)
from lavis.common.registry import registry
from lavis.common.utils import now

# imports modules for registration
from lavis.datasets.builders import *
from lavis.models import *
from lavis.processors import *
from lavis.runners import *
from lavis.tasks import *

In [None]:
# Copy the Vicuna directory to /tmp/vicuna.
# shutil.copytree raises FileExistsError when the destination already exists,
# so skip the copy on re-runs. Caveat: a partially copied /tmp/vicuna left by
# an interrupted run is also skipped; delete it manually in that case.
if not os.path.exists('/tmp/vicuna'):
    shutil.copytree('/ibex/project/c2133/vicuna', '/tmp/vicuna')

In [2]:
class StoppingCriteriaSub(StoppingCriteria):
    """Stop generation once the first sequence ends with one of the stop patterns.

    Args:
        stops: Sequence of token-id patterns (1-D tensors or lists of ints).
            Generation stops when the tail of ``input_ids[0]`` equals any of them.
        encounters: Accepted for backward compatibility; it is not used.
    """

    def __init__(self, stops=None, encounters=1):
        super().__init__()
        # Avoid a shared mutable default argument: fall back to a fresh list.
        self.stops = stops if stops is not None else []

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        """Return True when ``input_ids[0]`` ends with any stop pattern.

        Only the first sequence of the batch is inspected. Extra keyword
        arguments are accepted and ignored so the criterion keeps working when
        the caller forwards additional arguments.
        """
        sequence = input_ids[0]
        for stop in self.stops:
            stop_len = len(stop)
            # A sequence shorter than the pattern cannot end with it. Without
            # this guard the element-wise comparison would either broadcast
            # (and possibly report a false match) or raise on a size mismatch.
            if stop_len == 0 or sequence.shape[0] < stop_len:
                continue
            # Compare on the sequence's device so a pattern created on another
            # device does not raise.
            pattern = torch.as_tensor(stop, device=sequence.device)
            if torch.all(pattern == sequence[-stop_len:]).item():
                return True

        return False


# '###' can be encoded in different ways, so register both token-id patterns.
stop_words_ids = [
    torch.tensor(token_ids).to('cuda:0')
    for token_ids in ([835], [2277, 29937])
]
# Wrap the custom criterion in the list type that generate() is given later.
stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])

In [6]:
# Build an argument parser and feed it a fixed argument list (see parse_args
# below) instead of reading sys.argv, since this runs inside a notebook.
parser = argparse.ArgumentParser(description="Training")

parser.add_argument("--cfg-path", required=True, help="path to configuration file.")
# NOTE(review): the help text below tells users to switch to --cfg-options,
# but this parser does not define a --cfg-options flag.
parser.add_argument(
 "--options",
 nargs="+",
 help="override some settings in the used config, the key-value pair "
 "in xxx=yyy format will be merged into config file (deprecate), "
 "change to --cfg-options instead.",
)

# Only --cfg-path is supplied; --options is left at its default (None).
args = parser.parse_args(["--cfg-path", "lavis/projects/blip2/train/vicuna_pretrain_stage2_cc.yaml"])

cfg = Config(args)
# Target device string reused by later cells when moving the model and tensors.
device = 'cuda:0'

In [4]:
# Take the training-time visual processor config of the cc_combine dataset,
# look up the processor class registered under its name, and build it from that config.
vis_processor_cfg = cfg.datasets_cfg.cc_combine.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)

In [7]:
# Create the task described by the config, then let the task build the model.
# NOTE(review): the saved output of this cell is a traceback raised inside
# build_model, so this cell did not complete in the recorded run.
task = tasks.setup_task(cfg)
model = task.build_model(cfg)

Loading LLAMA


[31m╭─[0m[31m──────────────────────────────[0m[31m [0m[1;31mTraceback [0m[1;2;31m(most recent call last)[0m[31m [0m[31m───────────────────────────────[0m[31m─╮[0m
[31m│[0m in [92m[0m:[94m2[0m [31m│[0m
[31m│[0m [31m│[0m
[31m│[0m [2m1 [0mtask = tasks.setup_task(cfg) [31m│[0m
[31m│[0m [31m❱ [0m2 model = task.build_model(cfg) [31m│[0m
[31m│[0m [2m3 [0m [31m│[0m
[31m│[0m [31m│[0m
[31m│[0m [2;33m/home/zhud/project/blip2/lavis/tasks/[0m[1;33mbase_task.py[0m:[94m33[0m in [92mbuild_model[0m [31m│[0m
[31m│[0m [31m│[0m
[31m│[0m [2m 30 [0m[2m│ │ [0mmodel_config = cfg.model_cfg [31m│[0m
[31m│[0m [2m 31 [0m[2m│ │ [0m [31m│[0m
[31m│[0m [2m 32 [0m[2m│ │ [0mmodel_cls = registry.get_model_class(model_config.arch) [31m│[0m
[31m│[0m [31m❱ [0m 33 [2m│ │ [0m[94mreturn[0m model_cls.from_config(model_config) [31m│[0m
[31m│[0m [2m 34 [0m[2m│ [0m [31m│[0m
[31m│[0m [2m 35 [0m[2m│ [0m[94mdef[0m [92

'/ibex/project/c2133/vicuna'

### Load Checkpoint

In [None]:
# Load a saved checkpoint onto the CPU first (map_location="cpu"), apply its
# 'model' entry to the model, then move the model to `device`.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
ckpt_path = '/ibex/project/c2133/vicuna_ckpt_test/Vicuna_prompt_stage2_laion/20230410145/checkpoint_4.pth'
ckpt = torch.load(ckpt_path, map_location="cpu")
# strict=False tolerates missing/unexpected keys. The report is stored in `msg`
# but not inspected in this cell, so a mismatched checkpoint would go unnoticed.
msg = model.load_state_dict(ckpt['model'], strict=False)
model = model.to(device)

### Example of Tokenizer

In [35]:
# Tokenize a batch of two strings: pad to the longest item, truncate to at
# most 10 tokens, return PyTorch tensors, and move the result to `device`.
texts = ["A chat", "The assistant gives helpful"]

llama_tokens = model.llama_tokenizer(
 texts, 
 return_tensors="pt", 
 padding="longest",
 truncation=True,
 max_length=10).to(device)

In [13]:
# Same tokenizer call as the batch example, but with a single string.
# NOTE(review): this rebinds `texts` (previously a list) and `llama_tokens`,
# so later cells see whichever of the two tokenizer cells ran last.
texts = "The assistant gives helpful"

llama_tokens = model.llama_tokenizer(
 texts, 
 return_tensors="pt", 
 padding="longest",
 truncation=True,
 max_length=10).to(device)

In [14]:
# Display the shape of the attention mask of the most recently tokenized input.
llama_tokens.attention_mask.shape

torch.Size([1, 5])

In [9]:
# Copy the token ids and replace every padding position with -100.
# Presumably -100 is used so a loss function ignores those positions — TODO confirm.
targets = llama_tokens.input_ids.masked_fill(
 llama_tokens.input_ids == model.llama_tokenizer.pad_token_id, -100
 )

In [10]:
# Display the shape of a ones tensor sized from `targets`.
# NOTE(review): both dimensions are derived from targets.shape[0] (N x N+1);
# confirm whether the second dimension was meant to use targets.shape[1].
torch.ones([targets.shape[0], targets.shape[0]+1]).shape

torch.Size([2, 3])

In [None]:
# Build a one-turn chat prompt from adjacent string literals.
# NOTE(review): the literals are joined with no separator, so the prompt reads
# "...What's your name?### Assistant: " with no space before the second "###".
text = \
"### Human: What's your name?" \
"### Assistant: "


# Tokenize the prompt without padding/truncation and move it to `device`.
llama_tokens = model.llama_tokenizer(
 text, 
 return_tensors="pt", 
 ).to(device)

### Example of Emb Input

In [188]:
# Look up the token embeddings explicitly so generation is driven by
# `inputs_embeds` rather than by token ids.
inputs_embeds = model.llama_model.model.embed_tokens(llama_tokens.input_ids)
# NOTE(review): query_embeds looks like a project-specific generate() argument
# rather than a standard one — confirm against the model implementation.
# Generation is capped at 500 new tokens and stops early via `stopping_criteria`.
outputs = model.llama_model.generate(
 inputs_embeds=inputs_embeds,
 query_embeds=None,
 attention_mask=llama_tokens.attention_mask,
 max_new_tokens=500,
 stopping_criteria=stopping_criteria,
 )
# Decode only the first returned sequence.
output_text = model.llama_tokenizer.decode(outputs[0])
print(output_text)

​

I'm sorry, I am an AI language model and do not have a physical form or a name. My purpose is to assist you with any questions or tasks you may have to the best of my ability. Is there anything specific you would like help with?
###


In [189]:
# Display the shape of the embeddings computed for the prompt.
inputs_embeds.shape

torch.Size([1, 16, 5120])

### Example of ID Input

In [None]:
# Same generation call as the embedding example, but passing the token ids
# directly via input_ids instead of precomputed embeddings.
outputs = model.llama_model.generate(
 input_ids=llama_tokens.input_ids,
 query_embeds=None,
 attention_mask=llama_tokens.attention_mask,
 max_new_tokens=500,
 stopping_criteria=stopping_criteria,
 )
# Decode only the first returned sequence.
output_text = model.llama_tokenizer.decode(outputs[0])
print(output_text)

### Example of Mixed Input

In [47]:
# Load a different checkpoint onto the CPU, apply its 'model' entry to the
# model, then move the model to `device`.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
ckpt_path = '/home/zhud/project/blip2/lavis/output/BLIP2/Vicuna_pretrain_stage2_cc/20230408015/checkpoint_2.pth'
ckpt = torch.load(ckpt_path, map_location="cpu")
# strict=False tolerates missing/unexpected keys. The report is stored in `msg`
# but not inspected in this cell.
msg = model.load_state_dict(ckpt['model'], strict=False)
model = model.to(device)

In [48]:
# Load the image using PIL and convert it to RGB.
image = Image.open('test_img5.jpg').convert('RGB')
# `image` is rebound from the PIL image to the processed result; unsqueeze(0)
# adds a leading axis (presumably the batch dimension) before moving to `device`.
image = vis_processor(image).unsqueeze(0).to(device)
# encode_img returns two values; later cells use them as the image embeddings
# and the matching attention mask.
inputs_llama, atts_llama = model.encode_img(image)

In [53]:
# Three prompt variants are assigned to `text` in sequence. Only the LAST
# assignment takes effect; the first two are immediately overwritten and are
# kept here only as alternatives to switch to by hand.
# 'To_Split' marks the position where the image embeddings are inserted.

# Variant 1 (overwritten): system preamble, image sent in its own Human turn.
text = \
"A chat between a curious human and an artificial intelligence assistant. " \
"The assistant gives helpful, detailed, and polite answers to the human's questions. "\
"Human may ask questions related to a given image. " \
"The image will be wrapped as IMAGE_CONTENT " \
"### Human: To_Split " \
"### Assistant: Received the image. " \
"### Human: Describe the image in detail. Say everthing you see. Describe all the things." \
"### Assistant: "


# Variant 2 (overwritten): system preamble, image appended to the question.
text = \
"A chat between a curious human and an artificial intelligence assistant. " \
"The assistant gives helpful, detailed, and polite answers to the human's questions. "\
"Human may ask questions related to a given image. " \
"The image will be wrapped as IMAGE_CONTENT " \
"### Human: Describe the image in detail. Say everthing you see. To_Split " \
"### Assistant: "

# Variant 3 (the one actually used): no preamble, image appended to the question.
text = \
"### Human: Describe the image in detail. Say everthing you see. To_Split " \
"### Assistant: "



# text = \
# "A chat between a curious human and an artificial intelligence assistant. " \
# "The assistant gives helpful, detailed, and polite answers to the human's questions. "\
# "Human may ask questions related to a given image. " \
# "The image will be wrapped as IMAGE_CONTENT " \
# "### Human: To_Split " \
# "### Assistant: Received the image. " \
# "### Human: This is a draft of a website. Give me the html code to write this website. " \
# "Btw, you need to come up with some jokes in the website to fill the placeholders. " \
# "Also, make the website colorful and vivid. " \
# "### Assistant: "


# text = \
# "Return what the human says. " \
# "### Human: There is a big elephant in the sky. " \
# "### Assistant: There is a big elephant in the sky. " \
# "### Human: fdjlks klcznv_l1 " \
# "### Assistant: fdjlks klcznv_l1 " \
# "### Human: To_Split " \
# "### Assistant: "


# The two-target unpack requires exactly one 'To_Split' marker in `text`;
# zero or several markers raise ValueError here.
text_1, text_2 = text.split('To_Split')

# The prefix is tokenized with the tokenizer's default special-token handling,
# the suffix with add_special_tokens=False (presumably so special tokens such
# as BOS appear only at the very start — TODO confirm).
text_1_tokens = model.llama_tokenizer(text_1, return_tensors="pt").to(device)
text_2_tokens = model.llama_tokenizer(text_2, return_tensors="pt", add_special_tokens=False).to(device)
# Embed both halves so they can be concatenated with the image embeddings.
text_1_emb = model.llama_model.model.embed_tokens(text_1_tokens.input_ids)
text_2_emb = model.llama_model.model.embed_tokens(text_2_tokens.input_ids)

In [54]:
# Generate from [prefix text | image | suffix text]. The embeddings and the
# attention masks are concatenated along dim=1 in the same order so they stay
# aligned. Generation is capped at 600 new tokens.
outputs = model.llama_model.generate(
 inputs_embeds=torch.concat([text_1_emb, inputs_llama, text_2_emb], dim=1),
 query_embeds=None,
 attention_mask=torch.concat([text_1_tokens.attention_mask, atts_llama, text_2_tokens.attention_mask], dim=1),
 max_new_tokens=600,
 stopping_criteria=stopping_criteria,
 )
# Decode only the first returned sequence.
output_text = model.llama_tokenizer.decode(outputs[0])
print(output_text)



The image shows a small bird perched on a tree stump, with a camera lens in the background

The bird is a small bird, with a bright yellow beak and black feathers. It is perched on a tree stump, with its wings spread out and its beak open. The bird is looking to the left, as if it is about to take off.

The camera lens in the background is a large, black lens with a silver ring around the front. The lens is attached to a camera, which is not visible in the image. The lens is pointed at the bird, with the camera's viewfinder showing the bird in the center of the frame.

The background of the image is a forest, with trees and foliage visible in the distance. The trees are covered in leaves, and there is a thick layer of mist or fog in the air, which gives the image a dreamy, ethereal quality.

The lighting in the image is soft and diffused, with the sun shining through the trees and casting a warm, golden light on the bird and the tree stump. The lighting creates deep shadows in the fo

In [83]:
# Read the prompt file and split it into one list entry per line
# (line terminators are dropped by splitlines()).
# NOTE(review): no encoding is passed, so the platform default is used.
with open('lavis/prompts/image_caption.txt', 'r') as f:
 prompts = f.read().splitlines()

In [92]:
# Tokenize all prompts as one batch, padded to the longest prompt.
# The result is not moved to `device` here.
prompt_token = model.llama_tokenizer(prompts, return_tensors="pt", padding="longest",)

In [103]:


# Row sums of the attention mask, one value per prompt (presumably the number
# of non-padding tokens in each prompt).
my_list = prompt_token.attention_mask.sum(1).numpy()

# Tally how many prompts share each row-sum value.
counts = {}
for element in my_list:
    counts[element] = counts.get(element, 0) + 1

# Print (value, occurrences) pairs ordered by value.
print(sorted(counts.items(), key=lambda pair: pair[0]))

[(15, 6), (16, 11), (17, 17), (18, 17), (19, 27), (20, 18), (21, 21), (22, 4), (23, 6), (24, 2)]


In [58]:
# Interleave two lists element by element: [a0, b0, a1, b1, ...].
a, b = [1, 1, 1], [2, 2, 2]
# Walk the shared index range (the shorter length, matching zip's truncation)
# and emit the pair at each index.
c = [value for idx in range(min(len(a), len(b))) for value in (a[idx], b[idx])]
print(c)

[1, 2, 1, 2, 1, 2]


### Example of Image Input

In [67]:
# NOTE(review): inputs_embeds is recomputed here from `llama_tokens` (defined
# in an earlier cell) but is not passed to generate() below.
inputs_embeds = model.llama_model.model.embed_tokens(llama_tokens.input_ids)
# Embed the BOS token id and add two leading axes with [None, None] so it can
# be concatenated with the image embeddings along dim=1.
bos_embeds = model.llama_model.model.embed_tokens(torch.tensor(model.llama_tokenizer.bos_token_id, device=device))[None, None]
# Generate from [BOS | image] only, with no text prompt. The attention mask is
# extended by one leading column (copied from the first image column) to cover
# the BOS position. Generation is capped at 100 new tokens.
outputs = model.llama_model.generate(
 inputs_embeds=torch.concat([bos_embeds, inputs_llama], dim=1),
 query_embeds=None,
 attention_mask=torch.concat([atts_llama[:, :1], atts_llama], dim=1),
 max_new_tokens=100,
 stopping_criteria=stopping_criteria,
 )
# Decode only the first returned sequence.
output_text = model.llama_tokenizer.decode(outputs[0])
print(output_text)

a bird eating from a bird feeder

bird feeder, bird feeder, bird feeder, bird feeder, bird feeder, bird feeder, bird
bird feeder, bird feeder, bird feeder, bird feeder, bird feeder, bird feeder, bird
bird feeder, bird feeder, bird feeder, bird feeder, bird feeder, bird feeder, bird
bird feeder, bird feeder, bird feeder
