Taipei-1-training
Collection
15 items
•
Updated
import torch
from transformers import AutoModel, AutoProcessor, pipeline
import librosa
from PIL import Image
model_path = "ocisd4/multi-modal-llama-ocis"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True, token="hf_tokens")
pipe = pipeline(model=model_path, trust_remote_code=True, processor=processor, device_map='auto')
audio, sr = librosa.load("/path/to/請問圖片中的景點是哪裡.wav", sr=16000)
image = Image.open("/path/to/台南孔廟.jpg")
turns = [
dict(
role='system',
content = "You are a travel expert who can accurately analyze the attractions in the pictures. All conversations should be conducted in Traditional Chinese.",
),
dict(
role='user',
content='<|image|><|begin_of_audio|><|audio|><|end_of_audio|>'
)
]
y_pred = pipe({'audio': [audio], 'images': [image], 'turns': turns, 'sampling_rate': sr}, max_new_tokens=300)
print(f"{y_pred}") # 這張照片中的景點是台灣的「台南孔廟」。...