File size: 12,866 Bytes
961640d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/homebrew/Caskroom/miniconda/base/envs/llm/lib/python3.11/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
" _torch_pytree._register_pytree_node(\n",
"/opt/homebrew/Caskroom/miniconda/base/envs/llm/lib/python3.11/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
" _torch_pytree._register_pytree_node(\n",
"/opt/homebrew/Caskroom/miniconda/base/envs/llm/lib/python3.11/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
" _torch_pytree._register_pytree_node(\n"
]
}
],
"source": [
"#STT (speech to text)\n",
"from transformers import WhisperProcessor, WhisperForConditionalGeneration\n",
"from transformers import pipeline\n",
"import librosa"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
]
}
],
"source": [
"# load model and processor for speech-to-text\n",
"processor = WhisperProcessor.from_pretrained(\"openai/whisper-small\")\n",
"modelw = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-small\")\n",
"# modelw.config.forced_decoder_ids = None"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\""
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
]
}
],
"source": [
"transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\", device=device)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"device(type='cpu')"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"transcriber.device"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"audio = \"/Users/sasan.jafarnejad/dev/uni/talking-car/audio1713724521.779008.wav\"\n",
"audio = \"/Users/sasan.jafarnejad/dev/uni/talking-car/audio/attenborough/neutral.wav\"\n",
"if type(audio) == str:\n",
" link_to_audio = audio\n",
" audio = torchaudio.load(link_to_audio)\n",
" audio = audio[0].squeeze().numpy(), sr\n",
" if len(audio[0].shape) == 2:\n",
" audio = audio[0].mean(axis=0), audio[1]"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"y, sr = audio"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"out = transcriber({\"sampling_rate\": sr, \"raw\":y})[\"text\"]"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\" If you speed up time, plants begin to reveal their true nature. They're not passive organisms, as you might think, but competitive creatures every bit as aggressive as animals. They're locked in a desperate battle for light and space. They stretch and pulse as they strive to barge their way into pole position. Creepers and vines reach around for the branch or stem of another plant on which to hitch a ride. This is an assassin bug. To us, it's easy enough to spot because it moves. To its prey, that's irrelevant, because it smells like one of their number. The assassin sucks its victims dry and blues their empty husks onto its back. This one is already carrying at least 20 corpses. Its irregular shape makes it hard for other predators to spot it and makes it virtually invisible to its prey, ants. It enters this ant colony unchallenged. Its coat of ant corpses masks its own odour. To the ants, it smells like one of their own, and that's what matters. They'll even run straight over the top of it. The assassin simply takes an ant whenever it feels hungry, and the body of each victim then adds to its disguise. The giants here too. This is the Moala Moala, the sunfish. It's huge, three meters across, and addicted to lying on its side at the surface. It eats vast quantities of jellyfish. And there are not only fish from me in these waters, there are mammals. sea lions, whose ancestors originally came from the coasts of California. The Galapagos plankton is so abundant it attracts some of the biggest of all ocean mammals, humpback whales, and rivalling them in size the biggest of all fish. 20 ton whale shark. Few parts of the world's oceans can equally as Galapagos waters for sheer variety and abundance. That creature was a penguin. Penguins are ocean-goings for moose. But a few thousand years ago some of them got caught in the cold waters of the Humboldt current and were carried northwards up the coast of South America and out to the Galapagos. They could hardly have found anywhere more different from their polar home and in and the response they chain, the Emperor penguin that lives near the South Pole stands over a metre high, Galapagos penguin is now only half, and that helps a lot in the Galapagos. Small animals lose heat much faster than big ones, and the penguins have developed behavioral tricks as well. Bear feet are easily sunburnt, so they do their best to keep them covered, and some parts of the sea around the islands are quite cool. The humbalt current flowing up from the Antarctic and washing around the western parts of the archipelago is still quite chilly. So most of the penguins stay in the channel between the two westernmost island and when things get really hot, they can still cool off with the swim. They're quick to detect the slightest variation in temperature and move around to find places where an eddy might have brought a pleasing chill. The arrival of penguins must be the most unlikely event in the whole story of the colonization of the Galapagos. The existence of creatures like these so far from the nearest continent poses many questions. How, for example, did these enormous beasts get to the islands in the first place? But perhaps the most extraordinary thing about the Galapagos tortoises is that they're not all the same. islands have different kinds. In the heyday there were 15 species. They seem to have appeared in an evolutionary blink of the eye. But will soon become a leaf. It's no ordinary. It has a special altogether more sinister. This is Nepenthes, the pitcher plant. It grows in nutrient poor soils, So has to find nitrogen and minerals in another way. The leaf, just like a flower, attracts insects with a reward. The pitcher is coloured and scented to appeal to flies looking for a meal of rotting flesh. The visitors are rewarded with a greasy substance on the underside of the pitcher's lid. But the plant wants something in return, not pollen, but a meal. The lip of the pitcher is covered in tiny slippery ridges. Wax lubricates the surface further. It's extremely difficult to hold on, even for a fly. Once inside, there's no escape. The leaf holds a pool of digestive liquid. This contains microscopic elastic filaments, which give it the properties of quicksand. The more the insect struggles, the deeper it sinks. Enzymes begin to dissolve the victim's body while it's still alive.\""
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"out"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def transcript(audio):\n",
" if type(audio) == str:\n",
" link_to_audio = audio\n",
" audio = torchaudio.load(link_to_audio)\n",
"\n",
" # We assume that the audio is the audio tensor\n",
"\n",
" # process the audio array\n",
" input_features = processor(audio_array, sampling_rate, return_tensors=\"pt\").input_features\n",
" predicted_ids = modelw.generate(input_features)\n",
"\n",
" transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
" \n",
" return audio_path, state['context'], state"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def transcript(general_context, link_to_audio, voice, place, time, delete_history, state):\n",
" \"\"\"this function manages speech-to-text to input Fnanswer function and text-to-speech with the Fnanswer output\"\"\"\n",
" # load audio from a specific path\n",
" audio_path = link_to_audio\n",
" audio_array, sampling_rate = librosa.load(link_to_audio, sr=16000) # \"sr=16000\" ensures that the sampling rate is as required\n",
"\n",
" # process the audio array\n",
" input_features = processor(audio_array, sampling_rate, return_tensors=\"pt\").input_features\n",
" predicted_ids = modelw.generate(input_features)\n",
"\n",
" transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
" \n",
"\n",
"\n",
" return audio_path, state['context'], state"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running on local URL: http://127.0.0.1:7875\n",
"\n",
"To create a public link, set `share=True` in `launch()`.\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"http://127.0.0.1:7875/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": []
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import numpy as np\n",
"import gradio as gr\n",
"import torchaudio\n",
"import time\n",
"import torch\n",
"\n",
"def save_audio_as_wav(data, sample_rate, file_path):\n",
" # make a tensor from the numpy array\n",
" data = torch.tensor(data).reshape(1, -1)\n",
" torchaudio.save(file_path, data, sample_rate=sample_rate, bits_per_sample=16, encoding=\"PCM_S\")\n",
"\n",
"def save_and_transcribe_audio(audio):\n",
" # capture the audio and save it to a file as wav or mp3\n",
" # file_name = save(\"audioinput.wav\")\n",
" sr, y = audio\n",
" # y = y.astype(np.float32)\n",
" # y /= np.max(np.abs(y))\n",
"\n",
" # add timestamp to file name\n",
" filename = f\"audio{time.time()}.wav\"\n",
" save_audio_as_wav(y, sr, filename)\n",
" \n",
" sr, y = audio\n",
" y = y.astype(np.float32)\n",
" y /= np.max(np.abs(y))\n",
" text = transcriber({\"sampling_rate\": sr, \"raw\":y})[\"text\"]\n",
" return text\n",
"\n",
"gr.Interface(\n",
" fn=save_and_transcribe_audio, \n",
" inputs=gr.Audio(sources=\"microphone\", type=\"numpy\", label=\"Record Audio\"), \n",
" outputs=\"text\").launch()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "llm",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|