File size: 12,866 Bytes
961640d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/homebrew/Caskroom/miniconda/base/envs/llm/lib/python3.11/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
      "  _torch_pytree._register_pytree_node(\n",
      "/opt/homebrew/Caskroom/miniconda/base/envs/llm/lib/python3.11/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
      "  _torch_pytree._register_pytree_node(\n",
      "/opt/homebrew/Caskroom/miniconda/base/envs/llm/lib/python3.11/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.\n",
      "  _torch_pytree._register_pytree_node(\n"
     ]
    }
   ],
   "source": [
    "#STT (speech to text)\n",
    "from transformers import WhisperProcessor, WhisperForConditionalGeneration\n",
    "from transformers import pipeline\n",
    "import librosa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "# load model and processor for speech-to-text\n",
    "processor = WhisperProcessor.from_pretrained(\"openai/whisper-small\")\n",
    "modelw = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-small\")\n",
    "# modelw.config.forced_decoder_ids = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
     ]
    }
   ],
   "source": [
    "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\", device=device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "device(type='cpu')"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "transcriber.device"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "audio = \"/Users/sasan.jafarnejad/dev/uni/talking-car/audio1713724521.779008.wav\"\n",
    "audio = \"/Users/sasan.jafarnejad/dev/uni/talking-car/audio/attenborough/neutral.wav\"\n",
    "if type(audio) == str:\n",
    "    link_to_audio = audio\n",
    "    audio = torchaudio.load(link_to_audio)\n",
    "    audio = audio[0].squeeze().numpy(), sr\n",
    "    if len(audio[0].shape) == 2:\n",
    "        audio = audio[0].mean(axis=0), audio[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "y, sr = audio"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "out = transcriber({\"sampling_rate\": sr, \"raw\":y})[\"text\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\" If you speed up time, plants begin to reveal their true nature. They're not passive organisms, as you might think, but competitive creatures every bit as aggressive as animals. They're locked in a desperate battle for light and space. They stretch and pulse as they strive to barge their way into pole position. Creepers and vines reach around for the branch or stem of another plant on which to hitch a ride. This is an assassin bug. To us, it's easy enough to spot because it moves. To its prey, that's irrelevant, because it smells like one of their number. The assassin sucks its victims dry and blues their empty husks onto its back. This one is already carrying at least 20 corpses. Its irregular shape makes it hard for other predators to spot it and makes it virtually invisible to its prey, ants. It enters this ant colony unchallenged. Its coat of ant corpses masks its own odour. To the ants, it smells like one of their own, and that's what matters. They'll even run straight over the top of it. The assassin simply takes an ant whenever it feels hungry, and the body of each victim then adds to its disguise. The giants here too. This is the Moala Moala, the sunfish. It's huge, three meters across, and addicted to lying on its side at the surface. It eats vast quantities of jellyfish. And there are not only fish from me in these waters, there are mammals. sea lions, whose ancestors originally came from the coasts of California. The Galapagos plankton is so abundant it attracts some of the biggest of all ocean mammals, humpback whales, and rivalling them in size the biggest of all fish. 20 ton whale shark. Few parts of the world's oceans can equally as Galapagos waters for sheer variety and abundance. That creature was a penguin. Penguins are ocean-goings for moose. But a few thousand years ago some of them got caught in the cold waters of the Humboldt current and were carried northwards up the coast of South America and out to the Galapagos. They could hardly have found anywhere more different from their polar home and in and the response they chain, the Emperor penguin that lives near the South Pole stands over a metre high, Galapagos penguin is now only half, and that helps a lot in the Galapagos. Small animals lose heat much faster than big ones, and the penguins have developed behavioral tricks as well. Bear feet are easily sunburnt, so they do their best to keep them covered, and some parts of the sea around the islands are quite cool. The humbalt current flowing up from the Antarctic and washing around the western parts of the archipelago is still quite chilly. So most of the penguins stay in the channel between the two westernmost island and when things get really hot, they can still cool off with the swim. They're quick to detect the slightest variation in temperature and move around to find places where an eddy might have brought a pleasing chill. The arrival of penguins must be the most unlikely event in the whole story of the colonization of the Galapagos. The existence of creatures like these so far from the nearest continent poses many questions. How, for example, did these enormous beasts get to the islands in the first place? But perhaps the most extraordinary thing about the Galapagos tortoises is that they're not all the same. islands have different kinds. In the heyday there were 15 species. They seem to have appeared in an evolutionary blink of the eye. But will soon become a leaf. It's no ordinary. It has a special altogether more sinister. This is Nepenthes, the pitcher plant. It grows in nutrient poor soils, So has to find nitrogen and minerals in another way. The leaf, just like a flower, attracts insects with a reward. The pitcher is coloured and scented to appeal to flies looking for a meal of rotting flesh. The visitors are rewarded with a greasy substance on the underside of the pitcher's lid. But the plant wants something in return, not pollen, but a meal. The lip of the pitcher is covered in tiny slippery ridges. Wax lubricates the surface further. It's extremely difficult to hold on, even for a fly. Once inside, there's no escape. The leaf holds a pool of digestive liquid. This contains microscopic elastic filaments, which give it the properties of quicksand. The more the insect struggles, the deeper it sinks. Enzymes begin to dissolve the victim's body while it's still alive.\""
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def transcript(audio):\n",
    "    if type(audio) == str:\n",
    "        link_to_audio = audio\n",
    "        audio = torchaudio.load(link_to_audio)\n",
    "\n",
    "    # We assume that the audio is the audio tensor\n",
    "\n",
    "    # process the audio array\n",
    "    input_features = processor(audio_array, sampling_rate, return_tensors=\"pt\").input_features\n",
    "    predicted_ids = modelw.generate(input_features)\n",
    "\n",
    "    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
    "    \n",
    "    return audio_path, state['context'], state"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def transcript(general_context, link_to_audio, voice, place, time, delete_history, state):\n",
    "    \"\"\"this function manages speech-to-text to input Fnanswer function and text-to-speech with the Fnanswer output\"\"\"\n",
    "    # load audio from a specific path\n",
    "    audio_path = link_to_audio\n",
    "    audio_array, sampling_rate = librosa.load(link_to_audio, sr=16000)  # \"sr=16000\" ensures that the sampling rate is as required\n",
    "\n",
    "    # process the audio array\n",
    "    input_features = processor(audio_array, sampling_rate, return_tensors=\"pt\").input_features\n",
    "    predicted_ids = modelw.generate(input_features)\n",
    "\n",
    "    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
    "    \n",
    "\n",
    "\n",
    "    return audio_path, state['context'], state"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Running on local URL:  http://127.0.0.1:7875\n",
      "\n",
      "To create a public link, set `share=True` in `launch()`.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"http://127.0.0.1:7875/\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": []
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import gradio as gr\n",
    "import torchaudio\n",
    "import time\n",
    "import torch\n",
    "\n",
    "def save_audio_as_wav(data, sample_rate, file_path):\n",
    "    # make a tensor from the numpy array\n",
    "    data = torch.tensor(data).reshape(1, -1)\n",
    "    torchaudio.save(file_path, data, sample_rate=sample_rate, bits_per_sample=16, encoding=\"PCM_S\")\n",
    "\n",
    "def save_and_transcribe_audio(audio):\n",
    "    # capture the audio and save it to a file as wav or mp3\n",
    "    # file_name = save(\"audioinput.wav\")\n",
    "    sr, y = audio\n",
    "    # y = y.astype(np.float32)\n",
    "    # y /= np.max(np.abs(y))\n",
    "\n",
    "    # add timestamp to file name\n",
    "    filename = f\"audio{time.time()}.wav\"\n",
    "    save_audio_as_wav(y, sr, filename)\n",
    "    \n",
    "    sr, y = audio\n",
    "    y = y.astype(np.float32)\n",
    "    y /= np.max(np.abs(y))\n",
    "    text = transcriber({\"sampling_rate\": sr, \"raw\":y})[\"text\"]\n",
    "    return text\n",
    "\n",
    "gr.Interface(\n",
    "    fn=save_and_transcribe_audio, \n",
    "    inputs=gr.Audio(sources=\"microphone\", type=\"numpy\", label=\"Record Audio\"), \n",
    "    outputs=\"text\").launch()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}