oceansweep committed on
Commit
13b4956
1 Parent(s): 311f5fa

Update App_Function_Libraries/Audio_Transcription_Lib.py

Browse files
App_Function_Libraries/Audio_Transcription_Lib.py CHANGED
@@ -1,329 +1,289 @@
1
- # Audio_Transcription_Lib.py
2
- #########################################
3
- # Transcription Library
4
- # This library is used to perform transcription of audio files.
5
- # Currently, uses faster_whisper for transcription.
6
- #
7
- ####################
8
- # Function List
9
- #
10
- # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
11
- # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False)
12
- #
13
- ####################
14
- #
15
- # Import necessary libraries to run solo for testing
16
- import gc
17
- import json
18
- import logging
19
- import os
20
- import queue
21
- import sys
22
- import subprocess
23
- import tempfile
24
- import threading
25
- import time
26
- # DEBUG Imports
27
- #from memory_profiler import profile
28
- import pyaudio
29
- from faster_whisper import WhisperModel as OriginalWhisperModel
30
- from typing import Optional, Union, List, Dict, Any
31
- #
32
- # Import Local
33
- from App_Function_Libraries.Utils.Utils import load_comprehensive_config
34
- #
35
- #######################################################################################################################
36
- # Function Definitions
37
- #
38
-
39
- # Convert video .m4a into .wav using ffmpeg
40
- # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
41
- # https://www.gyan.dev/ffmpeg/builds/
42
- #
43
-
44
-
45
- whisper_model_instance = None
46
- config = load_comprehensive_config()
47
- processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
48
-
49
-
50
-
51
- class WhisperModel(OriginalWhisperModel):
52
- tldw_dir = os.path.dirname(os.path.dirname(__file__))
53
- default_download_root = os.path.join(tldw_dir, 'App_Function_Libraries', 'models', 'Whisper')
54
-
55
- valid_model_sizes = [
56
- "tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium",
57
- "large-v1", "large-v2", "large-v3", "large", "distil-large-v2", "distil-medium.en",
58
- "distil-small.en", "distil-large-v3"
59
- ]
60
-
61
- def __init__(
62
- self,
63
- model_size_or_path: str,
64
- device: str = "auto",
65
- device_index: Union[int, List[int]] = 0,
66
- compute_type: str = "default",
67
- cpu_threads: int = 16,
68
- num_workers: int = 1,
69
- download_root: Optional[str] = None,
70
- local_files_only: bool = False,
71
- files: Optional[Dict[str, Any]] = None,
72
- **model_kwargs: Any
73
- ):
74
- if download_root is None:
75
- download_root = self.default_download_root
76
-
77
- os.makedirs(download_root, exist_ok=True)
78
-
79
- # FIXME - validate....
80
- # Also write an integration test...
81
- # Check if model_size_or_path is a valid model size
82
- if model_size_or_path in self.valid_model_sizes:
83
- # It's a model size, so we'll use the download_root
84
- model_path = os.path.join(download_root, model_size_or_path)
85
- if not os.path.isdir(model_path):
86
- # If it doesn't exist, we'll let the parent class download it
87
- model_size_or_path = model_size_or_path # Keep the original model size
88
- else:
89
- # If it exists, use the full path
90
- model_size_or_path = model_path
91
- else:
92
- # It's not a valid model size, so assume it's a path
93
- model_size_or_path = os.path.abspath(model_size_or_path)
94
-
95
- super().__init__(
96
- model_size_or_path,
97
- device=device,
98
- device_index=device_index,
99
- compute_type=compute_type,
100
- cpu_threads=cpu_threads,
101
- num_workers=num_workers,
102
- download_root=download_root,
103
- local_files_only=local_files_only,
104
- # Maybe? idk, FIXME
105
- # files=files,
106
- # **model_kwargs
107
- )
108
-
109
- def get_whisper_model(model_name, device):
110
- global whisper_model_instance
111
- if whisper_model_instance is None:
112
- logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
113
- whisper_model_instance = WhisperModel(model_name, device=device)
114
- return whisper_model_instance
115
-
116
- # # FIXME: This is a temporary solution.
117
- # # This doesn't clear older models, which means potentially a lot of memory is being used...
118
- # def get_whisper_model(model_name, device):
119
- # global whisper_model_instance
120
- # if whisper_model_instance is None:
121
- # from faster_whisper import WhisperModel
122
- # logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
123
- #
124
- # # FIXME - add logic to detect if the model is already downloaded
125
- # # want to first check if the model is already downloaded
126
- # # if not, download it using the existing logic in 'WhisperModel'
127
- # # https://github.com/SYSTRAN/faster-whisper/blob/d57c5b40b06e59ec44240d93485a95799548af50/faster_whisper/transcribe.py#L584
128
- # # Designated path should be `tldw/App_Function_Libraries/models/Whisper/`
129
- # WhisperModel.download_root = os.path.join(os.path.dirname(__file__), 'models', 'Whisper')
130
- # os.makedirs(WhisperModel.download_root, exist_ok=True)
131
- # whisper_model_instance = WhisperModel(model_name, device=device)
132
- # return whisper_model_instance
133
-
134
-
135
- # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
136
- #DEBUG
137
- #@profile
138
- def convert_to_wav(video_file_path, offset=0, overwrite=False):
139
- out_path = os.path.splitext(video_file_path)[0] + ".wav"
140
-
141
- if os.path.exists(out_path) and not overwrite:
142
- print(f"File '{out_path}' already exists. Skipping conversion.")
143
- logging.info(f"Skipping conversion as file already exists: {out_path}")
144
- return out_path
145
- print("Starting conversion process of .m4a to .WAV")
146
- out_path = os.path.splitext(video_file_path)[0] + ".wav"
147
-
148
- try:
149
- if os.name == "nt":
150
- logging.debug("ffmpeg being ran on windows")
151
-
152
- if sys.platform.startswith('win'):
153
- ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
154
- logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
155
- else:
156
- ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems
157
-
158
- command = [
159
- ffmpeg_cmd, # Assuming the working directory is correctly set where .\Bin exists
160
- "-ss", "00:00:00", # Start at the beginning of the video
161
- "-i", video_file_path,
162
- "-ar", "16000", # Audio sample rate
163
- "-ac", "1", # Number of audio channels
164
- "-c:a", "pcm_s16le", # Audio codec
165
- out_path
166
- ]
167
- try:
168
- # Redirect stdin from null device to prevent ffmpeg from waiting for input
169
- with open(os.devnull, 'rb') as null_file:
170
- result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
171
- if result.returncode == 0:
172
- logging.info("FFmpeg executed successfully")
173
- logging.debug("FFmpeg output: %s", result.stdout)
174
- else:
175
- logging.error("Error in running FFmpeg")
176
- logging.error("FFmpeg stderr: %s", result.stderr)
177
- raise RuntimeError(f"FFmpeg error: {result.stderr}")
178
- except Exception as e:
179
- logging.error("Error occurred - ffmpeg doesn't like windows")
180
- raise RuntimeError("ffmpeg failed")
181
- elif os.name == "posix":
182
- os.system(f'ffmpeg -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
183
- else:
184
- raise RuntimeError("Unsupported operating system")
185
- logging.info("Conversion to WAV completed: %s", out_path)
186
- except subprocess.CalledProcessError as e:
187
- logging.error("Error executing FFmpeg command: %s", str(e))
188
- raise RuntimeError("Error converting video file to WAV")
189
- except Exception as e:
190
- logging.error("speech-to-text: Error transcribing audio: %s", str(e))
191
- return {"error": str(e)}
192
- gc.collect()
193
- return out_path
194
-
195
-
196
- # Transcribe .wav into .segments.json
197
- #DEBUG
198
- #@profile
199
- def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
200
- global whisper_model_instance, processing_choice
201
- logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)
202
-
203
- time_start = time.time()
204
- if audio_file_path is None:
205
- raise ValueError("speech-to-text: No audio file provided")
206
- logging.info("speech-to-text: Audio file path: %s", audio_file_path)
207
-
208
- try:
209
- _, file_ending = os.path.splitext(audio_file_path)
210
- out_file = audio_file_path.replace(file_ending, ".segments.json")
211
- prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
212
- if os.path.exists(out_file):
213
- logging.info("speech-to-text: Segments file already exists: %s", out_file)
214
- with open(out_file) as f:
215
- global segments
216
- segments = json.load(f)
217
- return segments
218
-
219
- logging.info('speech-to-text: Starting transcription...')
220
- options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
221
- transcribe_options = dict(task="transcribe", **options)
222
- # use function and config at top of file
223
- logging.debug("speech-to-text: Using whisper model: %s", whisper_model)
224
- whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
225
- segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)
226
-
227
- segments = []
228
- for segment_chunk in segments_raw:
229
- chunk = {
230
- "Time_Start": segment_chunk.start,
231
- "Time_End": segment_chunk.end,
232
- "Text": segment_chunk.text
233
- }
234
- logging.debug("Segment: %s", chunk)
235
- segments.append(chunk)
236
- # Print to verify its working
237
- print(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
238
-
239
- # Log it as well.
240
- logging.debug(
241
- f"Transcribed Segment: {segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
242
-
243
- if segments:
244
- segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
245
-
246
- if not segments:
247
- raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")
248
- logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)
249
-
250
- # Save the segments to a JSON file - prettified and non-prettified
251
- # FIXME so this is an optional flag to save either the prettified json file or the normal one
252
- save_json = True
253
- if save_json:
254
- logging.info("speech-to-text: Saving segments to JSON file")
255
- output_data = {'segments': segments}
256
-
257
- logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
258
- with open(prettified_out_file, 'w') as f:
259
- json.dump(output_data, f, indent=2)
260
-
261
- logging.info("speech-to-text: Saving JSON to %s", out_file)
262
- with open(out_file, 'w') as f:
263
- json.dump(output_data, f)
264
-
265
- logging.debug(f"speech-to-text: returning {segments[:500]}")
266
- gc.collect()
267
- return segments
268
-
269
- except Exception as e:
270
- logging.error("speech-to-text: Error transcribing audio: %s", str(e))
271
- raise RuntimeError("speech-to-text: Error transcribing audio")
272
-
273
-
274
- def record_audio(duration, sample_rate=16000, chunk_size=1024):
275
- p = pyaudio.PyAudio()
276
- stream = p.open(format=pyaudio.paInt16,
277
- channels=1,
278
- rate=sample_rate,
279
- input=True,
280
- frames_per_buffer=chunk_size)
281
-
282
- print("Recording...")
283
- frames = []
284
- stop_recording = threading.Event()
285
- audio_queue = queue.Queue()
286
-
287
- def audio_callback():
288
- for _ in range(0, int(sample_rate / chunk_size * duration)):
289
- if stop_recording.is_set():
290
- break
291
- data = stream.read(chunk_size)
292
- audio_queue.put(data)
293
-
294
- audio_thread = threading.Thread(target=audio_callback)
295
- audio_thread.start()
296
-
297
- return p, stream, audio_queue, stop_recording, audio_thread
298
-
299
-
300
- def stop_recording(p, stream, audio_queue, stop_recording_event, audio_thread):
301
- stop_recording_event.set()
302
- audio_thread.join()
303
-
304
- frames = []
305
- while not audio_queue.empty():
306
- frames.append(audio_queue.get())
307
-
308
- print("Recording finished.")
309
-
310
- stream.stop_stream()
311
- stream.close()
312
- p.terminate()
313
-
314
- return b''.join(frames)
315
-
316
- def save_audio_temp(audio_data, sample_rate=16000):
317
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
318
- import wave
319
- wf = wave.open(temp_file.name, 'wb')
320
- wf.setnchannels(1)
321
- wf.setsampwidth(2)
322
- wf.setframerate(sample_rate)
323
- wf.writeframes(audio_data)
324
- wf.close()
325
- return temp_file.name
326
-
327
- #
328
- #
329
  #######################################################################################################################
 
1
+ # Audio_Transcription_Lib.py
2
+ #########################################
3
+ # Transcription Library
4
+ # This library is used to perform transcription of audio files.
5
+ # Currently, uses faster_whisper for transcription.
6
+ #
7
+ ####################
8
+ # Function List
9
+ #
10
+ # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
11
+ # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False)
12
+ #
13
+ ####################
14
+ #
15
+ # Import necessary libraries to run solo for testing
16
+ import gc
17
+ import json
18
+ import logging
19
+ import os
20
+ import queue
21
+ import sys
22
+ import subprocess
23
+ import tempfile
24
+ import threading
25
+ import time
26
+ # DEBUG Imports
27
+ #from memory_profiler import profile
28
+ #import pyaudio
29
+ from faster_whisper import WhisperModel as OriginalWhisperModel
30
+ from typing import Optional, Union, List, Dict, Any
31
+ #
32
+ # Import Local
33
+ from App_Function_Libraries.Utils.Utils import load_comprehensive_config
34
+ #
35
+ #######################################################################################################################
36
+ # Function Definitions
37
+ #
38
+
39
+ # Convert video .m4a into .wav using ffmpeg
40
+ # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
41
+ # https://www.gyan.dev/ffmpeg/builds/
42
+ #
43
+
44
+
45
+ whisper_model_instance = None
46
+ config = load_comprehensive_config()
47
+ processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
48
+
49
+
50
+
51
class WhisperModel(OriginalWhisperModel):
    """faster_whisper ``WhisperModel`` with a project-local default download directory.

    The only behaviour added on top of the parent class is how the model
    identifier and ``download_root`` are resolved before the parent
    constructor is called.
    """

    # Parent of the directory that contains this file.
    tldw_dir = os.path.dirname(os.path.dirname(__file__))
    # Used as ``download_root`` whenever the caller does not supply one.
    default_download_root = os.path.join(tldw_dir, 'App_Function_Libraries', 'models', 'Whisper')

    # Names that are treated as model sizes rather than filesystem paths.
    valid_model_sizes = [
        "tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium",
        "large-v1", "large-v2", "large-v3", "large", "distil-large-v2", "distil-medium.en",
        "distil-small.en", "distil-large-v3"
    ]

    def __init__(
        self,
        model_size_or_path: str,
        device: str = "auto",
        device_index: Union[int, List[int]] = 0,
        compute_type: str = "default",
        cpu_threads: int = 16,
        num_workers: int = 1,
        download_root: Optional[str] = None,
        local_files_only: bool = False,
        files: Optional[Dict[str, Any]] = None,
        **model_kwargs: Any
    ):
        """Resolve the model location, then delegate to the parent constructor.

        Args:
            model_size_or_path: One of ``valid_model_sizes``, a local path, or
                any other identifier the parent class understands.
            device, device_index, compute_type, cpu_threads, num_workers,
            local_files_only: Forwarded unchanged to the parent class.
            download_root: Directory for downloaded models; defaults to
                ``default_download_root``. It is created if missing.
            files: Accepted for signature compatibility; NOT forwarded.
            **model_kwargs: Accepted for signature compatibility; NOT forwarded.
        """
        if download_root is None:
            download_root = self.default_download_root

        os.makedirs(download_root, exist_ok=True)

        # FIXME - validate.... Also write an integration test...
        if model_size_or_path in self.valid_model_sizes:
            # A known size: prefer a directory of that name under download_root
            # when one exists, otherwise keep the size so the parent resolves it.
            local_model_dir = os.path.join(download_root, model_size_or_path)
            if os.path.isdir(local_model_dir):
                model_size_or_path = local_model_dir
        elif os.path.exists(model_size_or_path):
            # An existing local path: normalise it to an absolute path.
            model_size_or_path = os.path.abspath(model_size_or_path)
        # Anything else (e.g. a Hugging Face repo id such as "org/name") is passed
        # through untouched; turning it into an absolute path would corrupt the id.

        super().__init__(
            model_size_or_path,
            device=device,
            device_index=device_index,
            compute_type=compute_type,
            cpu_threads=cpu_threads,
            num_workers=num_workers,
            download_root=download_root,
            local_files_only=local_files_only,
            # FIXME: `files` and `**model_kwargs` are deliberately not forwarded yet.
        )
108
+
109
def get_whisper_model(model_name, device):
    """Return the shared WhisperModel, loading or reloading it as needed.

    A single model is cached in the module-level ``whisper_model_instance``.
    The cached model is reused only while the requested ``(model_name, device)``
    pair matches the pair it was created with; otherwise it is replaced, so a
    caller never receives a model different from the one it asked for.

    Args:
        model_name: Model size or path handed to ``WhisperModel``.
        device: Device string handed to ``WhisperModel``.

    Returns:
        The cached ``WhisperModel`` instance.
    """
    global whisper_model_instance
    requested = (model_name, device)
    # The pair is stored on the instance itself, so no extra module state is needed.
    loaded = getattr(whisper_model_instance, "_tldw_model_key", None)
    if whisper_model_instance is None or loaded != requested:
        logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
        # Drop the previous model before loading the next one so both are not
        # referenced at the same time.
        whisper_model_instance = None
        gc.collect()
        whisper_model_instance = WhisperModel(model_name, device=device)
        whisper_model_instance._tldw_model_key = requested
    return whisper_model_instance
115
+
116
+ # # FIXME: This is a temporary solution.
117
+ # # This doesn't clear older models, which means potentially a lot of memory is being used...
118
+ # def get_whisper_model(model_name, device):
119
+ # global whisper_model_instance
120
+ # if whisper_model_instance is None:
121
+ # from faster_whisper import WhisperModel
122
+ # logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
123
+ #
124
+ # # FIXME - add logic to detect if the model is already downloaded
125
+ # # want to first check if the model is already downloaded
126
+ # # if not, download it using the existing logic in 'WhisperModel'
127
+ # # https://github.com/SYSTRAN/faster-whisper/blob/d57c5b40b06e59ec44240d93485a95799548af50/faster_whisper/transcribe.py#L584
128
+ # # Designated path should be `tldw/App_Function_Libraries/models/Whisper/`
129
+ # WhisperModel.download_root = os.path.join(os.path.dirname(__file__), 'models', 'Whisper')
130
+ # os.makedirs(WhisperModel.download_root, exist_ok=True)
131
+ # whisper_model_instance = WhisperModel(model_name, device=device)
132
+ # return whisper_model_instance
133
+
134
+
135
+ # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
136
+ #DEBUG
137
+ #@profile
138
def convert_to_wav(video_file_path, offset=0, overwrite=False):
    """Convert a media file to a 16 kHz, mono, 16-bit PCM WAV file with ffmpeg.

    The output is written next to the input, with the extension replaced by
    ``.wav``. On Windows ``.\\Bin\\ffmpeg.exe`` (relative to the working
    directory) is used; elsewhere ``ffmpeg`` is looked up on PATH.

    Args:
        video_file_path: Path of the file to convert.
        offset: Currently unused; conversion always starts at 00:00:00.
        overwrite: When False and the ``.wav`` file already exists, the
            conversion is skipped and the existing path is returned.

    Returns:
        The ``.wav`` path on success or when the conversion was skipped.
        On any failure a dict ``{"error": <message>}`` is returned instead of
        raising (kept for compatibility with existing callers).
    """
    out_path = os.path.splitext(video_file_path)[0] + ".wav"

    if os.path.exists(out_path) and not overwrite:
        print(f"File '{out_path}' already exists. Skipping conversion.")
        logging.info(f"Skipping conversion as file already exists: {out_path}")
        return out_path

    print("Starting conversion process of .m4a to .WAV")

    if sys.platform.startswith('win'):
        logging.debug("ffmpeg being ran on windows")
        ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
    else:
        ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems
    logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")

    command = [
        ffmpeg_cmd,
        # We only get here when the output is absent or overwrite was requested;
        # without -y ffmpeg would prompt, and stdin is the null device.
        "-y",
        "-ss", "00:00:00",      # Start at the beginning of the input
        "-i", video_file_path,
        "-ar", "16000",         # Audio sample rate
        "-ac", "1",             # Number of audio channels
        "-c:a", "pcm_s16le",    # Audio codec
        out_path,
    ]

    try:
        # An argument list (no shell) keeps odd characters in file names from
        # being interpreted; stdin is detached so ffmpeg never waits for input.
        result = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,
            text=True,
            errors="replace",
            capture_output=True,
        )
        if result.returncode != 0:
            logging.error("Error in running FFmpeg")
            logging.error("FFmpeg stderr: %s", result.stderr)
            raise RuntimeError(f"FFmpeg error: {result.stderr}")
        logging.info("FFmpeg executed successfully")
        logging.debug("FFmpeg output: %s", result.stdout)
        logging.info("Conversion to WAV completed: %s", out_path)
    except Exception as e:
        logging.error("convert_to_wav: Error converting file to WAV: %s", str(e))
        return {"error": str(e)}

    gc.collect()
    return out_path
194
+
195
+
196
+ # Transcribe .wav into .segments.json
197
+ #DEBUG
198
+ #@profile
199
def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
    """Transcribe an audio file and cache the result as JSON next to it.

    Two files are written beside the audio file, using its path without the
    extension: ``<base>.segments.json`` and ``<base>.segments_pretty.json``.
    If ``<base>.segments.json`` already exists it is loaded and returned
    without running the model.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        selected_source_lang: Language passed to the model's ``transcribe``.
        whisper_model: Model size or path given to ``get_whisper_model``.
        vad_filter: Passed through to ``transcribe``.
        diarize: Currently unused.

    Returns:
        A list of dicts with keys ``Time_Start``, ``Time_End`` and ``Text``.
        The first segment's text is prefixed with a note naming the model.

    Raises:
        ValueError: If ``audio_file_path`` is None.
        RuntimeError: If anything fails while loading the cache or
            transcribing (the original exception is chained as the cause).
    """
    global whisper_model_instance, processing_choice
    logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)

    time_start = time.time()
    if audio_file_path is None:
        raise ValueError("speech-to-text: No audio file provided")
    logging.info("speech-to-text: Audio file path: %s", audio_file_path)

    try:
        # Build output names from the extension-less path. A plain str.replace
        # on the extension would also rewrite matching text in directory names
        # and misbehaves when the file has no extension at all.
        base_path, _ = os.path.splitext(audio_file_path)
        out_file = base_path + ".segments.json"
        prettified_out_file = base_path + ".segments_pretty.json"

        if os.path.exists(out_file):
            logging.info("speech-to-text: Segments file already exists: %s", out_file)
            with open(out_file) as f:
                cached = json.load(f)
            # The cache is written as {"segments": [...]}; unwrap it so a cache
            # hit returns the same list shape as a fresh transcription.
            if isinstance(cached, dict) and "segments" in cached:
                return cached["segments"]
            return cached

        logging.info('speech-to-text: Starting transcription...')
        options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
        transcribe_options = dict(task="transcribe", **options)
        # use function and config at top of file
        logging.debug("speech-to-text: Using whisper model: %s", whisper_model)
        whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
        segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)

        segments = []
        for segment_chunk in segments_raw:
            chunk = {
                "Time_Start": segment_chunk.start,
                "Time_End": segment_chunk.end,
                "Text": segment_chunk.text
            }
            logging.debug("Segment: %s", chunk)
            segments.append(chunk)
            # Print to verify it's working
            print(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")

            # Log it as well.
            logging.debug(
                f"Transcribed Segment: {segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")

        if not segments:
            raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")

        segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
        logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)

        # Save the segments to a JSON file - prettified and non-prettified
        # FIXME so this is an optional flag to save either the prettified json file or the normal one
        save_json = True
        if save_json:
            logging.info("speech-to-text: Saving segments to JSON file")
            output_data = {'segments': segments}

            logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
            with open(prettified_out_file, 'w') as f:
                json.dump(output_data, f, indent=2)

            logging.info("speech-to-text: Saving JSON to %s", out_file)
            with open(out_file, 'w') as f:
                json.dump(output_data, f)

        logging.debug(f"speech-to-text: returning {segments[:500]}")
        gc.collect()
        return segments

    except Exception as e:
        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
        raise RuntimeError("speech-to-text: Error transcribing audio") from e
272
+
273
+
274
def record_audio(duration, sample_rate=16000, chunk_size=1024):
    """Placeholder for audio recording; currently does nothing.

    All arguments are accepted and ignored, and ``None`` is returned.
    """
    return None
276
def save_audio_temp(audio_data, sample_rate=16000):
    """Write raw PCM bytes to a new temporary ``.wav`` file and return its path.

    The data is written as mono (1 channel) with a sample width of 2 bytes.

    Args:
        audio_data: Raw audio frames as bytes.
        sample_rate: Frame rate stored in the WAV header.

    Returns:
        Path of the temporary file. It is created with ``delete=False``, so it
        stays on disk after this function returns and the caller must remove it.
    """
    import wave  # local import: the module does not import wave at the top

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        # The context manager closes the writer (finalising the header) even
        # if one of the writes raises.
        with wave.open(temp_file.name, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes(audio_data)
        return temp_file.name
286
+
287
+ #
288
+ #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  #######################################################################################################################