---
library_name: transformers
pipeline_tag: text-generation
inference: true
widget:
- text: Hello!
  example_title: Hello world
  group: Python
---

This model is for debugging. It is randomly initialized using the config from [Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct), but with a much smaller size.

Code:

```python
import os
from io import BytesIO
from urllib.request import urlopen

import librosa
import torch
from transformers import (AutoConfig, AutoProcessor, GenerationConfig,
                          Qwen2AudioForConditionalGeneration, set_seed)

model_id = "Qwen/Qwen2-Audio-7B-Instruct"
repo_id = "yujiepan/qwen2-audio-tiny-random"
save_path = f"/tmp/{repo_id}"

# Shrink both the audio encoder and the text decoder.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
config.audio_config.encoder_layers = 2
config.audio_config.encoder_attention_heads = 2
config.audio_config.encoder_ffn_dim = 32
config.audio_config.d_model = 16
config.text_config.num_hidden_layers = 2
config.text_config.intermediate_size = 32
config.text_config.hidden_size = 16
config.text_config.num_attention_heads = 2
config.text_config.num_key_value_heads = 1

model = Qwen2AudioForConditionalGeneration(config=config)
model = model.to(torch.bfloat16).cuda().eval()
model.generation_config = GenerationConfig.from_pretrained(
    model_id, trust_remote_code=True,
)

# Re-initialize all weights with a fixed seed so the checkpoint is reproducible.
set_seed(42)
with torch.no_grad():
    for _, p in sorted(model.named_parameters()):
        torch.nn.init.uniform_(p, -0.3, 0.3)

processor = AutoProcessor.from_pretrained(model_id)
model.save_pretrained(save_path)
processor.save_pretrained(save_path)
os.system(f"ls -alh {save_path}")


def try_inference():
    processor = AutoProcessor.from_pretrained(save_path)
    model = Qwen2AudioForConditionalGeneration.from_pretrained(
        save_path, device_map="auto")
    conversation = [
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
        ]},
        {"role": "assistant", "content": "Yes, the speaker is female and in her twenties."},
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
        ]},
    ]
    text = processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False)
    # Download and decode every referenced audio clip at the sampling rate
    # the feature extractor expects.
    audios = []
    for message in conversation:
        if isinstance(message["content"], list):
            for ele in message["content"]:
                if ele["type"] == "audio":
                    audios.append(librosa.load(
                        BytesIO(urlopen(ele["audio_url"]).read()),
                        sr=processor.feature_extractor.sampling_rate)[0]
                    )
    inputs = processor(text=text, audios=audios,
                       return_tensors="pt", padding=True)
    inputs.input_ids = inputs.input_ids.to("cuda")
    generate_ids = model.generate(**inputs, max_length=256)
    # Keep only the newly generated tokens.
    generate_ids = generate_ids[:, inputs.input_ids.size(1):]
    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True,
        clean_up_tokenization_spaces=False)[0]
    print(response)


try_inference()
```
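
Once saved, the tiny checkpoint loads like any other Qwen2-Audio model. A minimal sketch, assuming the artifacts produced by the script above have been pushed to the `yujiepan/qwen2-audio-tiny-random` Hub repo:

```python
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

# Quick smoke test: load the tiny random checkpoint from the Hub
# (assumes the weights saved above were uploaded to this repo).
processor = AutoProcessor.from_pretrained("yujiepan/qwen2-audio-tiny-random")
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    "yujiepan/qwen2-audio-tiny-random")
print(sum(p.numel() for p in model.parameters()), "parameters")
```

Because the weights are random, the generated text is meaningless; the model is only useful for exercising code paths (loading, preprocessing, generation) without downloading the full 7B checkpoint.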