"""Synthesize Arabic speech from text with a pretrained VITS checkpoint."""

import torch
from transformers import AutoTokenizer, VitsModel

# Single source of truth for the checkpoint, so the model and tokenizer
# are always loaded from the same repository.
MODEL_ID = "SeyedAli/Arabic-Speech-synthesis"

model = VitsModel.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

text = "السلام عليكم ورحمة الله وبركاته"
# return_tensors="pt" asks the tokenizer for PyTorch tensors, which is what
# the model call below is given as keyword arguments.
inputs = tokenizer(text, return_tensors="pt")

# Inference only: disable gradient tracking for the forward pass.
with torch.no_grad():
    output = model(**inputs).waveform
# NOTE(review): `output` is not saved or played anywhere in this snippet;
# presumably a later step writes it to an audio file -- confirm.