diff --git "a/dev.ipynb" "b/dev.ipynb"
new file mode 100644--- /dev/null
+++ "b/dev.ipynb"
@@ -0,0 +1,283 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/hewliyang/speech-to-speech-translation/.venv/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "import torch\n",
+ "import librosa\n",
+ "from transformers import VitsModel, VitsTokenizer, pipeline\n",
+ "from IPython.display import Audio"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Using cuda:0 with dtype torch.float16\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Some weights of the model checkpoint at facebook/mms-tts-zlm were not used when initializing VitsModel: ['flow.flows.0.wavenet.in_layers.0.weight_g', 'flow.flows.0.wavenet.in_layers.0.weight_v', 'flow.flows.0.wavenet.in_layers.1.weight_g', 'flow.flows.0.wavenet.in_layers.1.weight_v', 'flow.flows.0.wavenet.in_layers.2.weight_g', 'flow.flows.0.wavenet.in_layers.2.weight_v', 'flow.flows.0.wavenet.in_layers.3.weight_g', 'flow.flows.0.wavenet.in_layers.3.weight_v', 'flow.flows.0.wavenet.res_skip_layers.0.weight_g', 'flow.flows.0.wavenet.res_skip_layers.0.weight_v', 'flow.flows.0.wavenet.res_skip_layers.1.weight_g', 'flow.flows.0.wavenet.res_skip_layers.1.weight_v', 'flow.flows.0.wavenet.res_skip_layers.2.weight_g', 'flow.flows.0.wavenet.res_skip_layers.2.weight_v', 'flow.flows.0.wavenet.res_skip_layers.3.weight_g', 'flow.flows.0.wavenet.res_skip_layers.3.weight_v', 'flow.flows.1.wavenet.in_layers.0.weight_g', 'flow.flows.1.wavenet.in_layers.0.weight_v', 'flow.flows.1.wavenet.in_layers.1.weight_g', 'flow.flows.1.wavenet.in_layers.1.weight_v', 'flow.flows.1.wavenet.in_layers.2.weight_g', 'flow.flows.1.wavenet.in_layers.2.weight_v', 'flow.flows.1.wavenet.in_layers.3.weight_g', 'flow.flows.1.wavenet.in_layers.3.weight_v', 'flow.flows.1.wavenet.res_skip_layers.0.weight_g', 'flow.flows.1.wavenet.res_skip_layers.0.weight_v', 'flow.flows.1.wavenet.res_skip_layers.1.weight_g', 'flow.flows.1.wavenet.res_skip_layers.1.weight_v', 'flow.flows.1.wavenet.res_skip_layers.2.weight_g', 'flow.flows.1.wavenet.res_skip_layers.2.weight_v', 'flow.flows.1.wavenet.res_skip_layers.3.weight_g', 'flow.flows.1.wavenet.res_skip_layers.3.weight_v', 'flow.flows.2.wavenet.in_layers.0.weight_g', 'flow.flows.2.wavenet.in_layers.0.weight_v', 'flow.flows.2.wavenet.in_layers.1.weight_g', 'flow.flows.2.wavenet.in_layers.1.weight_v', 'flow.flows.2.wavenet.in_layers.2.weight_g', 'flow.flows.2.wavenet.in_layers.2.weight_v', 'flow.flows.2.wavenet.in_layers.3.weight_g', 'flow.flows.2.wavenet.in_layers.3.weight_v', 'flow.flows.2.wavenet.res_skip_layers.0.weight_g', 'flow.flows.2.wavenet.res_skip_layers.0.weight_v', 'flow.flows.2.wavenet.res_skip_layers.1.weight_g', 'flow.flows.2.wavenet.res_skip_layers.1.weight_v', 'flow.flows.2.wavenet.res_skip_layers.2.weight_g', 'flow.flows.2.wavenet.res_skip_layers.2.weight_v', 'flow.flows.2.wavenet.res_skip_layers.3.weight_g', 'flow.flows.2.wavenet.res_skip_layers.3.weight_v', 'flow.flows.3.wavenet.in_layers.0.weight_g', 'flow.flows.3.wavenet.in_layers.0.weight_v', 'flow.flows.3.wavenet.in_layers.1.weight_g', 'flow.flows.3.wavenet.in_layers.1.weight_v', 'flow.flows.3.wavenet.in_layers.2.weight_g', 'flow.flows.3.wavenet.in_layers.2.weight_v', 'flow.flows.3.wavenet.in_layers.3.weight_g', 'flow.flows.3.wavenet.in_layers.3.weight_v', 'flow.flows.3.wavenet.res_skip_layers.0.weight_g', 'flow.flows.3.wavenet.res_skip_layers.0.weight_v', 'flow.flows.3.wavenet.res_skip_layers.1.weight_g', 'flow.flows.3.wavenet.res_skip_layers.1.weight_v', 'flow.flows.3.wavenet.res_skip_layers.2.weight_g', 'flow.flows.3.wavenet.res_skip_layers.2.weight_v', 'flow.flows.3.wavenet.res_skip_layers.3.weight_g', 'flow.flows.3.wavenet.res_skip_layers.3.weight_v', 'posterior_encoder.wavenet.in_layers.0.weight_g', 'posterior_encoder.wavenet.in_layers.0.weight_v', 'posterior_encoder.wavenet.in_layers.1.weight_g', 'posterior_encoder.wavenet.in_layers.1.weight_v', 'posterior_encoder.wavenet.in_layers.10.weight_g', 'posterior_encoder.wavenet.in_layers.10.weight_v', 'posterior_encoder.wavenet.in_layers.11.weight_g', 'posterior_encoder.wavenet.in_layers.11.weight_v', 'posterior_encoder.wavenet.in_layers.12.weight_g', 'posterior_encoder.wavenet.in_layers.12.weight_v', 'posterior_encoder.wavenet.in_layers.13.weight_g', 'posterior_encoder.wavenet.in_layers.13.weight_v', 'posterior_encoder.wavenet.in_layers.14.weight_g', 'posterior_encoder.wavenet.in_layers.14.weight_v', 'posterior_encoder.wavenet.in_layers.15.weight_g', 'posterior_encoder.wavenet.in_layers.15.weight_v', 'posterior_encoder.wavenet.in_layers.2.weight_g', 'posterior_encoder.wavenet.in_layers.2.weight_v', 'posterior_encoder.wavenet.in_layers.3.weight_g', 'posterior_encoder.wavenet.in_layers.3.weight_v', 'posterior_encoder.wavenet.in_layers.4.weight_g', 'posterior_encoder.wavenet.in_layers.4.weight_v', 'posterior_encoder.wavenet.in_layers.5.weight_g', 'posterior_encoder.wavenet.in_layers.5.weight_v', 'posterior_encoder.wavenet.in_layers.6.weight_g', 'posterior_encoder.wavenet.in_layers.6.weight_v', 'posterior_encoder.wavenet.in_layers.7.weight_g', 'posterior_encoder.wavenet.in_layers.7.weight_v', 'posterior_encoder.wavenet.in_layers.8.weight_g', 'posterior_encoder.wavenet.in_layers.8.weight_v', 'posterior_encoder.wavenet.in_layers.9.weight_g', 'posterior_encoder.wavenet.in_layers.9.weight_v', 'posterior_encoder.wavenet.res_skip_layers.0.weight_g', 'posterior_encoder.wavenet.res_skip_layers.0.weight_v', 'posterior_encoder.wavenet.res_skip_layers.1.weight_g', 'posterior_encoder.wavenet.res_skip_layers.1.weight_v', 'posterior_encoder.wavenet.res_skip_layers.10.weight_g', 'posterior_encoder.wavenet.res_skip_layers.10.weight_v', 'posterior_encoder.wavenet.res_skip_layers.11.weight_g', 'posterior_encoder.wavenet.res_skip_layers.11.weight_v', 'posterior_encoder.wavenet.res_skip_layers.12.weight_g', 'posterior_encoder.wavenet.res_skip_layers.12.weight_v', 'posterior_encoder.wavenet.res_skip_layers.13.weight_g', 'posterior_encoder.wavenet.res_skip_layers.13.weight_v', 'posterior_encoder.wavenet.res_skip_layers.14.weight_g', 'posterior_encoder.wavenet.res_skip_layers.14.weight_v', 'posterior_encoder.wavenet.res_skip_layers.15.weight_g', 'posterior_encoder.wavenet.res_skip_layers.15.weight_v', 'posterior_encoder.wavenet.res_skip_layers.2.weight_g', 'posterior_encoder.wavenet.res_skip_layers.2.weight_v', 'posterior_encoder.wavenet.res_skip_layers.3.weight_g', 'posterior_encoder.wavenet.res_skip_layers.3.weight_v', 'posterior_encoder.wavenet.res_skip_layers.4.weight_g', 'posterior_encoder.wavenet.res_skip_layers.4.weight_v', 'posterior_encoder.wavenet.res_skip_layers.5.weight_g', 'posterior_encoder.wavenet.res_skip_layers.5.weight_v', 'posterior_encoder.wavenet.res_skip_layers.6.weight_g', 'posterior_encoder.wavenet.res_skip_layers.6.weight_v', 'posterior_encoder.wavenet.res_skip_layers.7.weight_g', 'posterior_encoder.wavenet.res_skip_layers.7.weight_v', 'posterior_encoder.wavenet.res_skip_layers.8.weight_g', 'posterior_encoder.wavenet.res_skip_layers.8.weight_v', 'posterior_encoder.wavenet.res_skip_layers.9.weight_g', 'posterior_encoder.wavenet.res_skip_layers.9.weight_v']\n",
+ "- This IS expected if you are initializing VitsModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+ "- This IS NOT expected if you are initializing VitsModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
+ "Some weights of VitsModel were not initialized from the model checkpoint at facebook/mms-tts-zlm and are newly initialized: ['flow.flows.0.wavenet.in_layers.0.parametrizations.weight.original0', 'flow.flows.0.wavenet.in_layers.0.parametrizations.weight.original1', 'flow.flows.0.wavenet.in_layers.1.parametrizations.weight.original0', 'flow.flows.0.wavenet.in_layers.1.parametrizations.weight.original1', 'flow.flows.0.wavenet.in_layers.2.parametrizations.weight.original0', 'flow.flows.0.wavenet.in_layers.2.parametrizations.weight.original1', 'flow.flows.0.wavenet.in_layers.3.parametrizations.weight.original0', 'flow.flows.0.wavenet.in_layers.3.parametrizations.weight.original1', 'flow.flows.0.wavenet.res_skip_layers.0.parametrizations.weight.original0', 'flow.flows.0.wavenet.res_skip_layers.0.parametrizations.weight.original1', 'flow.flows.0.wavenet.res_skip_layers.1.parametrizations.weight.original0', 'flow.flows.0.wavenet.res_skip_layers.1.parametrizations.weight.original1', 'flow.flows.0.wavenet.res_skip_layers.2.parametrizations.weight.original0', 'flow.flows.0.wavenet.res_skip_layers.2.parametrizations.weight.original1', 'flow.flows.0.wavenet.res_skip_layers.3.parametrizations.weight.original0', 'flow.flows.0.wavenet.res_skip_layers.3.parametrizations.weight.original1', 'flow.flows.1.wavenet.in_layers.0.parametrizations.weight.original0', 'flow.flows.1.wavenet.in_layers.0.parametrizations.weight.original1', 'flow.flows.1.wavenet.in_layers.1.parametrizations.weight.original0', 'flow.flows.1.wavenet.in_layers.1.parametrizations.weight.original1', 'flow.flows.1.wavenet.in_layers.2.parametrizations.weight.original0', 'flow.flows.1.wavenet.in_layers.2.parametrizations.weight.original1', 'flow.flows.1.wavenet.in_layers.3.parametrizations.weight.original0', 'flow.flows.1.wavenet.in_layers.3.parametrizations.weight.original1', 'flow.flows.1.wavenet.res_skip_layers.0.parametrizations.weight.original0', 'flow.flows.1.wavenet.res_skip_layers.0.parametrizations.weight.original1', 'flow.flows.1.wavenet.res_skip_layers.1.parametrizations.weight.original0', 'flow.flows.1.wavenet.res_skip_layers.1.parametrizations.weight.original1', 'flow.flows.1.wavenet.res_skip_layers.2.parametrizations.weight.original0', 'flow.flows.1.wavenet.res_skip_layers.2.parametrizations.weight.original1', 'flow.flows.1.wavenet.res_skip_layers.3.parametrizations.weight.original0', 'flow.flows.1.wavenet.res_skip_layers.3.parametrizations.weight.original1', 'flow.flows.2.wavenet.in_layers.0.parametrizations.weight.original0', 'flow.flows.2.wavenet.in_layers.0.parametrizations.weight.original1', 'flow.flows.2.wavenet.in_layers.1.parametrizations.weight.original0', 'flow.flows.2.wavenet.in_layers.1.parametrizations.weight.original1', 'flow.flows.2.wavenet.in_layers.2.parametrizations.weight.original0', 'flow.flows.2.wavenet.in_layers.2.parametrizations.weight.original1', 'flow.flows.2.wavenet.in_layers.3.parametrizations.weight.original0', 'flow.flows.2.wavenet.in_layers.3.parametrizations.weight.original1', 'flow.flows.2.wavenet.res_skip_layers.0.parametrizations.weight.original0', 'flow.flows.2.wavenet.res_skip_layers.0.parametrizations.weight.original1', 'flow.flows.2.wavenet.res_skip_layers.1.parametrizations.weight.original0', 'flow.flows.2.wavenet.res_skip_layers.1.parametrizations.weight.original1', 'flow.flows.2.wavenet.res_skip_layers.2.parametrizations.weight.original0', 'flow.flows.2.wavenet.res_skip_layers.2.parametrizations.weight.original1', 'flow.flows.2.wavenet.res_skip_layers.3.parametrizations.weight.original0', 'flow.flows.2.wavenet.res_skip_layers.3.parametrizations.weight.original1', 'flow.flows.3.wavenet.in_layers.0.parametrizations.weight.original0', 'flow.flows.3.wavenet.in_layers.0.parametrizations.weight.original1', 'flow.flows.3.wavenet.in_layers.1.parametrizations.weight.original0', 'flow.flows.3.wavenet.in_layers.1.parametrizations.weight.original1', 'flow.flows.3.wavenet.in_layers.2.parametrizations.weight.original0', 'flow.flows.3.wavenet.in_layers.2.parametrizations.weight.original1', 'flow.flows.3.wavenet.in_layers.3.parametrizations.weight.original0', 'flow.flows.3.wavenet.in_layers.3.parametrizations.weight.original1', 'flow.flows.3.wavenet.res_skip_layers.0.parametrizations.weight.original0', 'flow.flows.3.wavenet.res_skip_layers.0.parametrizations.weight.original1', 'flow.flows.3.wavenet.res_skip_layers.1.parametrizations.weight.original0', 'flow.flows.3.wavenet.res_skip_layers.1.parametrizations.weight.original1', 'flow.flows.3.wavenet.res_skip_layers.2.parametrizations.weight.original0', 'flow.flows.3.wavenet.res_skip_layers.2.parametrizations.weight.original1', 'flow.flows.3.wavenet.res_skip_layers.3.parametrizations.weight.original0', 'flow.flows.3.wavenet.res_skip_layers.3.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.0.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.0.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.1.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.1.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.10.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.10.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.11.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.11.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.12.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.12.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.13.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.13.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.14.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.14.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.15.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.15.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.2.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.2.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.3.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.3.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.4.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.4.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.5.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.5.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.6.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.6.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.7.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.7.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.8.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.8.parametrizations.weight.original1', 'posterior_encoder.wavenet.in_layers.9.parametrizations.weight.original0', 'posterior_encoder.wavenet.in_layers.9.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.0.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.0.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.1.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.1.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.10.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.10.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.11.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.11.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.12.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.12.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.13.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.13.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.14.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.14.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.15.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.15.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.2.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.2.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.3.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.3.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.4.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.4.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.5.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.5.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.6.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.6.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.7.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.7.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.8.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.8.parametrizations.weight.original1', 'posterior_encoder.wavenet.res_skip_layers.9.parametrizations.weight.original0', 'posterior_encoder.wavenet.res_skip_layers.9.parametrizations.weight.original1']\n",
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+ "config.json: 100%|██████████| 1.27k/1.27k [00:00<00:00, 4.96MB/s]\n",
+ "model.safetensors: 100%|██████████| 3.09G/3.09G [03:04<00:00, 16.7MB/s]\n",
+ "generation_config.json: 100%|██████████| 3.94k/3.94k [00:00<00:00, 21.4MB/s]\n",
+ "tokenizer_config.json: 100%|██████████| 283k/283k [00:00<00:00, 6.43MB/s]\n",
+ "vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 15.4MB/s]\n",
+ "tokenizer.json: 100%|██████████| 2.48M/2.48M [00:00<00:00, 7.27MB/s]\n",
+ "merges.txt: 100%|██████████| 494k/494k [00:00<00:00, 16.5MB/s]\n",
+ "normalizer.json: 100%|██████████| 52.7k/52.7k [00:00<00:00, 223kB/s]\n",
+ "added_tokens.json: 100%|██████████| 34.6k/34.6k [00:00<00:00, 54.4MB/s]\n",
+ "special_tokens_map.json: 100%|██████████| 2.07k/2.07k [00:00<00:00, 10.3MB/s]\n",
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
+ "preprocessor_config.json: 100%|██████████| 340/340 [00:00<00:00, 1.69MB/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
+ "torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n",
+ "print(f\"Using {device} with dtype {torch_dtype}\")\n",
+ "\n",
+ "model = VitsModel.from_pretrained(\"facebook/mms-tts-zlm\")\n",
+ "tokenizer = VitsTokenizer.from_pretrained(\"facebook/mms-tts-zlm\")\n",
+ "\n",
+ "asr_pipe = pipeline( # noqa: F821\n",
+ " \"automatic-speech-recognition\",\n",
+ " model=\"openai/whisper-large-v3\",\n",
+ " device=device,\n",
+ " torch_dtype=torch_dtype,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def synthesise(text):\n",
+ " inputs = tokenizer(text=text, return_tensors=\"pt\")\n",
+ " input_ids = inputs[\"input_ids\"]\n",
+ "\n",
+ " with torch.no_grad():\n",
+ " outputs = model(input_ids)\n",
+ "\n",
+ " speech = outputs[\"waveform\"]\n",
+ " return speech\n",
+ "\n",
+ "def translate(audio):\n",
+ " outputs = asr_pipe(\n",
+ " audio,\n",
+ " max_new_tokens=256,\n",
+ " generate_kwargs={\"task\": \"transcribe\", \"language\": \"ms\"},\n",
+ " )\n",
+ " return outputs[\"text\"]\n",
+ "\n",
+ "def speech_to_speech_translation(audio):\n",
+ " translated_text = translate(audio)\n",
+ " synthesised_speech = synthesise(translated_text)\n",
+ " synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)\n",
+ " return 16000, synthesised_speech"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "x, sr = librosa.load(\"audio.wav\", sr=16_000)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Audio(x, rate=sr)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Pada bab 16. Saya mungkin telah memberitahu anda tentang permulaan penyelidikan ini dalam beberapa lirik. Tetapi saya mahu anda melihat setiap langkah dengan mana kami datang. Saya juga setuju dengan apa-apa pun yang Marguerite mahukan.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "text = translate(x)\n",
+ "print(text)\n",
+ "tts = synthesise(text)\n",
+ "\n",
+ "Audio(tts, rate=16_000)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[-8, -8, -5, ..., -4, -2, -3]], dtype=int16)"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import numpy as np\n",
+ "(tts.numpy() * 32767).astype(np.int16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Audio((tts.numpy() * 32767).astype(np.int16), rate=16_000)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "sr, x = speech_to_speech_translation(x)\n",
+ "Audio(x, rate=sr)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.0"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}