---
library_name: transformers
pipeline_tag: image-text-to-text
base_model: Qwen/Qwen2-VL-72B
tags:
- chat
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
---

This model is for debugging. It is randomly initialized with the config from [Qwen/QVQ-72B-Preview](https://huggingface.co/Qwen/QVQ-72B-Preview) but is of smaller size.

Code:

```python
import os
from typing import Dict

import requests
import torch
import transformers
from PIL import Image
from torchvision import io
from transformers import (AutoConfig, AutoModelForCausalLM, AutoProcessor,
                          AutoTokenizer, GenerationConfig,
                          enable_full_determinism, pipeline, set_seed)
from transformers.models.qwen2_vl import Qwen2VLForConditionalGeneration

model_id = "Qwen/QVQ-72B-Preview"
repo_id = "yujiepan/qvq-preview-tiny-random"
save_path = f"/tmp/{repo_id}"

# Start from the original config and shrink every size-related field so the
# randomly initialized model stays tiny.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
config.hidden_size = 16
config.intermediate_size = 32
config.num_attention_heads = 2
config.num_hidden_layers = 2
config.num_key_value_heads = 1
config.vision_config.embed_dim = 16
config.vision_config.num_heads = 2
config.vision_config.hidden_size = 16
config.vision_config.depth = 2
# sum needs to be 4 here
# (hidden_size 16 / 2 heads = head_dim 8; presumably the sections must sum to
# head_dim // 2 -- confirm against the Qwen2-VL rotary embedding code)
config.rope_scaling['mrope_section'] = [1, 1, 2]

# Seed everything before constructing the model so the random weights are
# reproducible.
enable_full_determinism(42)
model = Qwen2VLForConditionalGeneration(config=config)
model = model.to(torch.bfloat16).cuda().eval()
# Reuse the generation settings of the original model.
model.generation_config = GenerationConfig.from_pretrained(
    model_id, trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_id)
model.save_pretrained(save_path)
processor.save_pretrained(save_path)
os.system(f"ls -alh {save_path}")


def try_inference(model_id):
    """Smoke-test a saved checkpoint with one image + text chat prompt.

    Loads the model and processor from ``model_id`` (a local path or hub id),
    builds a two-message conversation (system prompt, then a user turn with
    an image URL and a question), generates up to 32 new tokens on CUDA and
    prints the decoded sequences, special tokens included.

    Args:
        model_id: Path or repository id passed to ``from_pretrained``.
    """
    # Deterministic algorithms were switched on by enable_full_determinism()
    # above; turn them off again for generation (presumably because some CUDA
    # ops used here have no deterministic implementation -- TODO confirm).
    torch.use_deterministic_algorithms(False)
    from qwen_vl_utils import process_vision_info
    from transformers import (AutoProcessor, AutoTokenizer,
                              Qwen2VLForConditionalGeneration)

    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_id, device_map="cuda"
    )
    processor = AutoProcessor.from_pretrained(model_id)

    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step."}
            ],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/QVQ/demo.png",
                },
                {"type": "text", "text": "What value should be filled in the blank space?"},
            ],
        }
    ]

    # Render the chat template to a prompt string; tokenization happens in
    # the processor call below together with the vision inputs.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=32)
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False
    )
    print(output_text)


try_inference(save_path)
```