{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`\n",
"INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.\n",
"Model was trained with torch 1.10.0+cu102, yours is 2.6.0. Bad things might happen unless you revert torch to 1.x.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached\n",
"INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached\n",
"INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached\n",
"INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached\n",
"INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached\n",
"INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder\n"
]
}
],
"source": [
"from pyannote.audio import Pipeline\n",
"\n",
"diarization_pipeline = Pipeline.from_pretrained(\n",
" \"pyannote/speaker-diarization\", \n",
" use_auth_token=\"\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"audio = \"audios/download_audio.mp3\""
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from IPython.display import Audio\n",
"\n",
"Audio(audio)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/macm1/miniforge3/envs/subtify/lib/python3.13/site-packages/torchaudio/_backend/soundfile_backend.py:71: UserWarning: The MPEG_LAYER_III subtype is unknown to TorchAudio. As a result, the bits_per_sample attribute will be set to 0. If you are seeing this warning, please report by opening an issue on github (after checking for existing/closed ones). You may otherwise ignore this warning.\n",
" warnings.warn(\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[27], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mdiarization_pipeline\u001b[49m\u001b[43m(\u001b[49m\u001b[43maudio\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/pyannote/audio/core/pipeline.py:327\u001b[0m, in \u001b[0;36mPipeline.__call__\u001b[0;34m(self, file, **kwargs)\u001b[0m\n\u001b[1;32m 324\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpreprocessors\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[1;32m 325\u001b[0m file \u001b[38;5;241m=\u001b[39m ProtocolFile(file, lazy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpreprocessors)\n\u001b[0;32m--> 327\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/pyannote/audio/pipelines/speaker_diarization.py:523\u001b[0m, in \u001b[0;36mSpeakerDiarization.apply\u001b[0;34m(self, file, num_speakers, min_speakers, max_speakers, return_embeddings, hook)\u001b[0m\n\u001b[1;32m 520\u001b[0m embeddings \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 522\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 523\u001b[0m embeddings \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_embeddings\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 524\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 525\u001b[0m \u001b[43m \u001b[49m\u001b[43mbinarized_segmentations\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 526\u001b[0m \u001b[43m \u001b[49m\u001b[43mexclude_overlap\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding_exclude_overlap\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 527\u001b[0m \u001b[43m \u001b[49m\u001b[43mhook\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhook\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 528\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 529\u001b[0m hook(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124membeddings\u001b[39m\u001b[38;5;124m\"\u001b[39m, embeddings)\n\u001b[1;32m 530\u001b[0m \u001b[38;5;66;03m# shape: (num_chunks, local_num_speakers, dimension)\u001b[39;00m\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/pyannote/audio/pipelines/speaker_diarization.py:348\u001b[0m, in \u001b[0;36mSpeakerDiarization.get_embeddings\u001b[0;34m(self, file, binary_segmentations, exclude_overlap, hook)\u001b[0m\n\u001b[1;32m 345\u001b[0m mask_batch \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mvstack(masks)\n\u001b[1;32m 346\u001b[0m \u001b[38;5;66;03m# (batch_size, num_frames) torch.Tensor\u001b[39;00m\n\u001b[0;32m--> 348\u001b[0m embedding_batch: np\u001b[38;5;241m.\u001b[39mndarray \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_embedding\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 349\u001b[0m \u001b[43m \u001b[49m\u001b[43mwaveform_batch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmasks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmask_batch\u001b[49m\n\u001b[1;32m 350\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 351\u001b[0m \u001b[38;5;66;03m# (batch_size, dimension) np.ndarray\u001b[39;00m\n\u001b[1;32m 353\u001b[0m embedding_batches\u001b[38;5;241m.\u001b[39mappend(embedding_batch)\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/pyannote/audio/pipelines/speaker_verification.py:374\u001b[0m, in \u001b[0;36mSpeechBrainPretrainedSpeakerEmbedding.__call__\u001b[0;34m(self, waveforms, masks)\u001b[0m\n\u001b[1;32m 370\u001b[0m wav_lens \u001b[38;5;241m=\u001b[39m wav_lens \u001b[38;5;241m/\u001b[39m max_len\n\u001b[1;32m 371\u001b[0m wav_lens[too_short] \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1.0\u001b[39m\n\u001b[1;32m 373\u001b[0m embeddings \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m--> 374\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclassifier_\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencode_batch\u001b[49m\u001b[43m(\u001b[49m\u001b[43msignals\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwav_lens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwav_lens\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 375\u001b[0m \u001b[38;5;241m.\u001b[39msqueeze(dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 376\u001b[0m \u001b[38;5;241m.\u001b[39mcpu()\n\u001b[1;32m 377\u001b[0m \u001b[38;5;241m.\u001b[39mnumpy()\n\u001b[1;32m 378\u001b[0m )\n\u001b[1;32m 380\u001b[0m embeddings[too_short\u001b[38;5;241m.\u001b[39mcpu()\u001b[38;5;241m.\u001b[39mnumpy()] \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mnan\n\u001b[1;32m 382\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m embeddings\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/speechbrain/inference/classifiers.py:112\u001b[0m, in \u001b[0;36mEncoderClassifier.encode_batch\u001b[0;34m(self, wavs, wav_lens, normalize)\u001b[0m\n\u001b[1;32m 110\u001b[0m feats \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmods\u001b[38;5;241m.\u001b[39mcompute_features(wavs)\n\u001b[1;32m 111\u001b[0m feats \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmods\u001b[38;5;241m.\u001b[39mmean_var_norm(feats, wav_lens)\n\u001b[0;32m--> 112\u001b[0m embeddings \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmods\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding_model\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfeats\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mwav_lens\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m normalize:\n\u001b[1;32m 114\u001b[0m embeddings \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhparams\u001b[38;5;241m.\u001b[39mmean_var_norm_emb(\n\u001b[1;32m 115\u001b[0m embeddings, torch\u001b[38;5;241m.\u001b[39mones(embeddings\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m], device\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m 116\u001b[0m )\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/speechbrain/lobes/models/ECAPA_TDNN.py:527\u001b[0m, in \u001b[0;36mECAPA_TDNN.forward\u001b[0;34m(self, x, lengths)\u001b[0m\n\u001b[1;32m 524\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmfa(x)\n\u001b[1;32m 526\u001b[0m \u001b[38;5;66;03m# Attentive Statistical Pooling\u001b[39;00m\n\u001b[0;32m--> 527\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43masp\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlengths\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlengths\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 528\u001b[0m x \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39masp_bn(x)\n\u001b[1;32m 530\u001b[0m \u001b[38;5;66;03m# Final linear transformation\u001b[39;00m\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/speechbrain/lobes/models/ECAPA_TDNN.py:280\u001b[0m, in \u001b[0;36mAttentiveStatisticsPooling.forward\u001b[0;34m(self, x, lengths)\u001b[0m\n\u001b[1;32m 277\u001b[0m attn \u001b[38;5;241m=\u001b[39m x\n\u001b[1;32m 279\u001b[0m \u001b[38;5;66;03m# Apply layers\u001b[39;00m\n\u001b[0;32m--> 280\u001b[0m attn \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconv(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtanh(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtdnn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mattn\u001b[49m\u001b[43m)\u001b[49m))\n\u001b[1;32m 282\u001b[0m \u001b[38;5;66;03m# Filter out zero-paddings\u001b[39;00m\n\u001b[1;32m 283\u001b[0m attn \u001b[38;5;241m=\u001b[39m attn\u001b[38;5;241m.\u001b[39mmasked_fill(mask \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m, \u001b[38;5;28mfloat\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m-inf\u001b[39m\u001b[38;5;124m\"\u001b[39m))\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/speechbrain/lobes/models/ECAPA_TDNN.py:81\u001b[0m, in \u001b[0;36mTDNNBlock.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 79\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, x):\n\u001b[1;32m 80\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Processes the input tensor x and returns an output tensor.\"\"\"\u001b[39;00m\n\u001b[0;32m---> 81\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnorm(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mactivation(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m))\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/speechbrain/nnet/CNN.py:455\u001b[0m, in \u001b[0;36mConv1d.forward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 449\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 450\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 451\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPadding must be \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124msame\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m, \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mvalid\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m or \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcausal\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m. Got \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 452\u001b[0m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpadding\n\u001b[1;32m 453\u001b[0m )\n\u001b[0;32m--> 455\u001b[0m wx \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mx\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 457\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39munsqueeze:\n\u001b[1;32m 458\u001b[0m wx \u001b[38;5;241m=\u001b[39m wx\u001b[38;5;241m.\u001b[39msqueeze(\u001b[38;5;241m1\u001b[39m)\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/torch/nn/modules/module.py:1739\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1738\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1739\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/torch/nn/modules/module.py:1750\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1747\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1748\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1749\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1750\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1752\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1753\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/torch/nn/modules/conv.py:375\u001b[0m, in \u001b[0;36mConv1d.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 374\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 375\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_conv_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbias\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/miniforge3/envs/subtify/lib/python3.13/site-packages/torch/nn/modules/conv.py:370\u001b[0m, in \u001b[0;36mConv1d._conv_forward\u001b[0;34m(self, input, weight, bias)\u001b[0m\n\u001b[1;32m 358\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpadding_mode \u001b[38;5;241m!=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mzeros\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 359\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m F\u001b[38;5;241m.\u001b[39mconv1d(\n\u001b[1;32m 360\u001b[0m F\u001b[38;5;241m.\u001b[39mpad(\n\u001b[1;32m 361\u001b[0m \u001b[38;5;28minput\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reversed_padding_repeated_twice, mode\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mpadding_mode\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 368\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgroups,\n\u001b[1;32m 369\u001b[0m )\n\u001b[0;32m--> 370\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconv1d\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 371\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbias\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstride\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpadding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdilation\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroups\u001b[49m\n\u001b[1;32m 372\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"outputs = diarization_pipeline(audio)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"outputs.for_json()[\"content\"]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"model_id = \"openai/whisper-large-v3-turbo\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"model = AutoModelForSpeechSeq2Seq.from_pretrained(\n",
" model_id, \n",
" torch_dtype=torch_dtype, \n",
" low_cpu_mem_usage=True, \n",
" use_safetensors=True\n",
")\n",
"model.to(device)\n",
"print()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"processor = AutoProcessor.from_pretrained(model_id)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Device set to use cpu\n"
]
}
],
"source": [
"pipe = pipeline(\n",
" \"automatic-speech-recognition\",\n",
" model=model,\n",
" feature_extractor=processor.feature_extractor,\n",
" tokenizer=processor.tokenizer,\n",
" torch_dtype=torch_dtype,\n",
" device=device,\n",
" chunk_length_s=30,\n",
" stride_length_s=5\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/macm1/miniforge3/envs/subtify/lib/python3.13/site-packages/transformers/models/whisper/generation_whisper.py:573: FutureWarning: The input name `inputs` is deprecated. Please make sure to use `input_features` instead.\n",
" warnings.warn(\n",
"You have passed language=english, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of language=english.\n",
"The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n"
]
}
],
"source": [
"result = pipe(audio, return_timestamps=True, generate_kwargs={\"language\": \"english\"})"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'text': \" Looking for straightforward solutions to a Google Search question? Or maybe you just want some inspiration on improving your presence on Search? Well, we've got you coming. At Search Central, we're the one-stop shop for all your search needs, brought to you by the folks behind, well, Google Search. You'll receive all the latest information, as it happens, from simple step-by-step tutorials. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. SPEAKER 1. Thank you. get content. Clear technical advice. The organic Google Search Traffic Dashboards. Whoa. TO FUN Explainer videos. Google's main crawler is called Googlebot. Hello, and welcome to another episode of Search Off the Record. All this and more to help you get your site and its findability in tip-top shape without all the fuss. So if you're looking for trusted weekly content without the long-winded lingo, subscribe to the Search Central YouTube channel today.\",\n",
" 'chunks': [{'timestamp': (0.8, 25.82),\n",
" 'text': \" Looking for straightforward solutions to a Google Search question? Or maybe you just want some inspiration on improving your presence on Search? Well, we've got you coming. At Search Central, we're the one-stop shop for all your search needs, brought to you by the folks behind, well, Google Search. You'll receive all the latest information, as it happens,\"},\n",
" {'timestamp': (25.82, 28.68),\n",
" 'text': ' from simple step-by-step tutorials.'},\n",
" {'timestamp': (28.68, 28.7), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (28.7, 28.72), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (28.72, 29.26), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (29.26, 29.86), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (29.86, 30.44), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (30.44, 30.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (30.94, 31.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (31.94, 32.44), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (32.44, 32.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (32.94, 33.44), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (33.44, 33.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (33.94, 34.44), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (34.44, 34.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (34.94, 35.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (35.94, 36.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (36.94, 37.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (37.94, 38.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (38.94, 39.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (39.94, 40.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (40.94, 41.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (41.94, 42.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (42.94, 43.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (43.94, 44.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (44.94, 45.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (45.94, 46.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (46.94, 47.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (47.94, 48.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (48.94, 49.94), 'text': ' SPEAKER 1.'},\n",
" {'timestamp': (49.94, 20.0), 'text': ''},\n",
" {'timestamp': (67.4, 70.22),\n",
" 'text': \" Thank you. get content. Clear technical advice. The organic Google Search Traffic Dashboards. Whoa. TO FUN Explainer videos. Google's main crawler is called Googlebot. Hello, and welcome to another episode of Search Off the Record. All this and more to help you get your site and its findability in tip-top shape without all the fuss. So if you're looking for trusted weekly content without the long-winded lingo,\"},\n",
" {'timestamp': (70.22, 73.12),\n",
" 'text': ' subscribe to the Search Central YouTube channel today.'}]}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "subtify",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}