oceansweep committed on
Commit
7b625ae
1 Parent(s): 74055ee

Update App_Function_Libraries/Audio/Audio_Transcription_Lib.py

Browse files
App_Function_Libraries/Audio/Audio_Transcription_Lib.py CHANGED
@@ -1,335 +1,284 @@
1
- # Audio_Transcription_Lib.py
2
- #########################################
3
- # Transcription Library
4
- # This library is used to perform transcription of audio files.
5
- # Currently, uses faster_whisper for transcription.
6
- #
7
- ####################
8
- # Function List
9
- #
10
- # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
11
- # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False)
12
- #
13
- ####################
14
- #
15
- # Import necessary libraries to run solo for testing
16
- import gc
17
- import json
18
- import logging
19
- import multiprocessing
20
- import os
21
- import queue
22
- import sys
23
- import subprocess
24
- import tempfile
25
- import threading
26
- import time
27
- # DEBUG Imports
28
- #from memory_profiler import profile
29
- import pyaudio
30
- from faster_whisper import WhisperModel as OriginalWhisperModel
31
- from typing import Optional, Union, List, Dict, Any
32
- #
33
- # Import Local
34
- from App_Function_Libraries.Utils.Utils import load_comprehensive_config
35
- from App_Function_Libraries.Metrics.metrics_logger import log_counter, log_histogram
36
- #
37
- #######################################################################################################################
38
- # Function Definitions
39
- #
40
-
41
- # Convert video .m4a into .wav using ffmpeg
42
- # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
43
- # https://www.gyan.dev/ffmpeg/builds/
44
- #
45
-
46
-
47
# Shared model handle; starts empty and is assigned by get_whisper_model().
- whisper_model_instance = None
48
# Project configuration, loaded once when this module is imported.
- config = load_comprehensive_config()
49
# Device setting read from the [Processing] section; falls back to 'cpu' when absent.
- processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
50
# CPU count of the host; only referenced in a commented-out default in the code shown here.
- total_thread_count = multiprocessing.cpu_count()
51
-
52
-
53
class WhisperModel(OriginalWhisperModel):
    """faster_whisper model wrapper that stores and looks up models under the project tree.

    Differences from the parent constructor:
      * ``download_root`` defaults to ``<tldw_dir>/models/Whisper`` and is created if missing.
      * A known model size that already exists as a directory under ``download_root``
        is passed on as that local directory.
      * Any other value is made absolute only when it exists on disk; otherwise it is
        handed to the parent unchanged (presumably a hub repository id or a model size
        this list does not know yet - the parent decides what to do with it).
    """

    # Two directory levels above this file.
    tldw_dir = os.path.dirname(os.path.dirname(__file__))
    default_download_root = os.path.join(tldw_dir, 'models', 'Whisper')

    valid_model_sizes = [
        "tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium",
        "large-v1", "large-v2", "large-v3", "large", "distil-large-v2", "distil-medium.en",
        "distil-small.en", "distil-large-v3",
    ]

    def __init__(
        self,
        model_size_or_path: str,
        device: str = processing_choice,
        device_index: Union[int, List[int]] = 0,
        compute_type: str = "default",
        cpu_threads: int = 0,  # FIXME (from original author): total_thread_count was considered; 0 kept
        num_workers: int = 1,
        download_root: Optional[str] = None,
        local_files_only: bool = False,
        files: Optional[Dict[str, Any]] = None,
        **model_kwargs: Any
    ):
        """Resolve the model location, then delegate to the faster_whisper constructor.

        Args:
            model_size_or_path: One of ``valid_model_sizes``, a local path, or any other
                identifier the parent class understands.
            device: Device string forwarded to the parent; defaults to the configured
                ``processing_choice``.
            device_index, compute_type, cpu_threads, num_workers, local_files_only:
                Forwarded to the parent unchanged.
            download_root: Directory for downloaded models; defaults to
                ``default_download_root``.
            files: Forwarded to the parent only when not None.
            **model_kwargs: Forwarded to the parent only when supplied.
                NOTE(review): ``files``/``model_kwargs`` were previously dropped silently;
                confirm the installed faster_whisper version accepts them.
        """
        if download_root is None:
            download_root = self.default_download_root

        os.makedirs(download_root, exist_ok=True)

        if model_size_or_path in self.valid_model_sizes:
            # Prefer an existing local copy; otherwise keep the size name so the
            # parent class can fetch the model into download_root.
            local_copy = os.path.join(download_root, model_size_or_path)
            if os.path.isdir(local_copy):
                model_size_or_path = local_copy
        elif os.path.exists(model_size_or_path):
            # A real path on disk: normalise it.
            model_size_or_path = os.path.abspath(model_size_or_path)
        # Anything else is passed through untouched; turning it into an absolute
        # path would corrupt identifiers that are not filesystem paths.

        # Only forward the optional extras when the caller actually provided them,
        # so the call stays identical to the previous one in the default case.
        optional_kwargs = dict(model_kwargs)
        if files is not None:
            optional_kwargs["files"] = files

        super().__init__(
            model_size_or_path,
            device=device,
            device_index=device_index,
            compute_type=compute_type,
            cpu_threads=cpu_threads,
            num_workers=num_workers,
            download_root=download_root,
            local_files_only=local_files_only,
            **optional_kwargs
        )
110
-
111
# (model_name, device) of the model currently held in whisper_model_instance.
_whisper_model_key = None


def get_whisper_model(model_name, device):
    """Return the shared WhisperModel for ``model_name`` on ``device``.

    A single model is kept in the module-level ``whisper_model_instance``. It is
    created on first use and replaced when a different model name or device is
    requested; previously the first model loaded was returned for every later
    request regardless of what was asked for.

    Args:
        model_name: Model size or path, passed to ``WhisperModel``.
        device: Device string, passed to ``WhisperModel``.

    Returns:
        The cached or newly created ``WhisperModel`` instance.
    """
    global whisper_model_instance, _whisper_model_key
    requested = (model_name, device)
    if whisper_model_instance is None or _whisper_model_key != requested:
        logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
        whisper_model_instance = WhisperModel(model_name, device=device)
        _whisper_model_key = requested
    return whisper_model_instance
117
-
118
- # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
119
- #DEBUG
120
- #@profile
121
def convert_to_wav(video_file_path, offset=0, overwrite=False):
    """Convert a media file to a 16 kHz, mono, 16-bit PCM WAV file using ffmpeg.

    The output is written next to the input, with the extension replaced by ``.wav``.

    Args:
        video_file_path: Path of the input media file.
        offset: Accepted for backward compatibility but not used; conversion always
            starts at 00:00:00.
        overwrite: When False and the WAV already exists, conversion is skipped.
            When True, ffmpeg is told to replace the existing file.

    Returns:
        The output WAV path on success or when conversion is skipped. On failure a
        dict ``{"error": <message>}`` is returned instead of raising (kept for
        existing callers).
    """
    log_counter("convert_to_wav_attempt", labels={"file_path": video_file_path})
    start_time = time.time()

    out_path = os.path.splitext(video_file_path)[0] + ".wav"

    if os.path.exists(out_path) and not overwrite:
        print(f"File '{out_path}' already exists. Skipping conversion.")
        logging.info(f"Skipping conversion as file already exists: {out_path}")
        log_counter("convert_to_wav_skipped", labels={"file_path": video_file_path})
        return out_path

    print("Starting conversion process of .m4a to .WAV")

    try:
        if os.name == "nt":
            logging.debug("ffmpeg being ran on windows")
            # Assumes the working directory is the one that contains .\Bin
            ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
            logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
        elif os.name == "posix":
            ffmpeg_cmd = "ffmpeg"  # Assume 'ffmpeg' is in PATH
        else:
            raise RuntimeError("Unsupported operating system")

        command = [ffmpeg_cmd]
        if overwrite:
            # stdin is detached below, so ffmpeg cannot ask whether to overwrite;
            # without -y it would refuse and exit with an error.
            command.append("-y")
        command += [
            "-ss", "00:00:00",      # Start at the beginning of the input
            "-i", video_file_path,
            "-ar", "16000",         # Audio sample rate
            "-ac", "1",             # Number of audio channels
            "-c:a", "pcm_s16le",    # Audio codec
            out_path,
        ]

        # An argument list (no shell) is used on every platform so that file names
        # containing quotes or shell metacharacters are passed through literally,
        # and the exit status is checked instead of being ignored.
        result = subprocess.run(command, stdin=subprocess.DEVNULL, text=True, capture_output=True)
        if result.returncode != 0:
            logging.error("Error in running FFmpeg")
            logging.error("FFmpeg stderr: %s", result.stderr)
            raise RuntimeError(f"FFmpeg error: {result.stderr}")
        logging.info("FFmpeg executed successfully")
        logging.debug("FFmpeg output: %s", result.stdout)

        logging.info("Conversion to WAV completed: %s", out_path)
        log_counter("convert_to_wav_success", labels={"file_path": video_file_path})
    except Exception as e:
        logging.error("convert_to_wav: Error converting file to WAV: %s", str(e))
        log_counter("convert_to_wav_error", labels={"file_path": video_file_path, "error": str(e)})
        return {"error": str(e)}

    conversion_time = time.time() - start_time
    log_histogram("convert_to_wav_duration", conversion_time, labels={"file_path": video_file_path})

    gc.collect()
    return out_path
185
-
186
-
187
- # Transcribe .wav into .segments.json
188
- #DEBUG
189
- #@profile
190
- # FIXME - I feel like the `vad_filter` should be enabled by default....
191
def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
    """Transcribe an audio file and cache the result as JSON next to it.

    Two files are written beside the audio file, named
    ``<audio path without extension>-whisper_model-<model>.segments.json`` and
    ``...segments_pretty.json``. If the first one already exists it is loaded and
    returned instead of transcribing again.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        selected_source_lang: Passed to the model as ``language``.
        whisper_model: Model size or path handed to ``get_whisper_model``.
        vad_filter: Passed through to the model's ``transcribe`` call.
        diarize: Accepted but not used by this function.

    Returns:
        A list of dicts with keys ``Time_Start``, ``Time_End`` and ``Text``. The
        first segment's text is prefixed with a line naming the model used. The
        cached path now returns the same list shape (the stored file wraps the
        list in ``{"segments": [...]}``, which used to be returned as-is).

    Raises:
        ValueError: ``audio_file_path`` is None.
        RuntimeError: Any failure while loading the cache, transcribing or saving,
            including an empty transcription. The original error is chained.
    """
    log_counter("speech_to_text_attempt", labels={"file_path": audio_file_path, "model": whisper_model})
    time_start = time.time()

    if audio_file_path is None:
        log_counter("speech_to_text_error", labels={"error": "No audio file provided"})
        raise ValueError("speech-to-text: No audio file provided")
    logging.info("speech-to-text: Audio file path: %s", audio_file_path)

    try:
        # Build the output names from the path stem. Substituting the extension
        # with str.replace would touch every occurrence of that text in the path,
        # and with an empty extension would insert the suffix between all characters.
        base_path = os.path.splitext(audio_file_path)[0]
        out_file = base_path + "-whisper_model-" + whisper_model + ".segments.json"
        prettified_out_file = base_path + "-whisper_model-" + whisper_model + ".segments_pretty.json"

        if os.path.exists(out_file):
            logging.info("speech-to-text: Segments file already exists: %s", out_file)
            with open(out_file, encoding="utf-8") as f:
                cached = json.load(f)
            # Unwrap the on-disk envelope so cached and fresh results have one shape.
            if isinstance(cached, dict) and "segments" in cached:
                cached = cached["segments"]
            return cached

        logging.info('speech-to-text: Starting transcription...')
        # FIXME - revisit this
        transcribe_options = dict(
            task="transcribe",
            language=selected_source_lang,
            beam_size=10,
            best_of=10,
            vad_filter=vad_filter,
        )
        logging.debug("speech-to-text: Using whisper model: %s", whisper_model)
        model = get_whisper_model(whisper_model, processing_choice)
        # faster_whisper transcription right here - FIXME - test batching
        segments_raw, _info = model.transcribe(audio_file_path, **transcribe_options)

        segments = []
        for segment_chunk in segments_raw:
            chunk = {
                "Time_Start": segment_chunk.start,
                "Time_End": segment_chunk.end,
                "Text": segment_chunk.text
            }
            logging.debug("Segment: %s", chunk)
            segments.append(chunk)
            logging.info(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
            logging.debug(
                f"Transcribed Segment: {segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")

        if not segments:
            log_counter("speech_to_text_error", labels={"error": "No transcription produced"})
            raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")

        segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]

        transcription_time = time.time() - time_start
        logging.info("speech-to-text: Transcription completed in %.2f seconds", transcription_time)
        log_histogram("speech_to_text_duration", transcription_time, labels={"file_path": audio_file_path, "model": whisper_model})
        log_counter("speech_to_text_success", labels={"file_path": audio_file_path, "model": whisper_model})

        # Save the segments to a JSON file - prettified and non-prettified
        # FIXME refactor so this is an optional flag to save either the prettified json file or the normal one
        logging.info("speech-to-text: Saving segments to JSON file")
        output_data = {'segments': segments}

        logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
        with open(prettified_out_file, 'w', encoding="utf-8") as f:
            json.dump(output_data, f, indent=2)

        logging.info("speech-to-text: Saving JSON to %s", out_file)
        with open(out_file, 'w', encoding="utf-8") as f:
            json.dump(output_data, f)

        logging.debug(f"speech-to-text: returning {segments[:500]}")
        gc.collect()
        return segments

    except Exception as e:
        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
        log_counter("speech_to_text_error", labels={"file_path": audio_file_path, "model": whisper_model, "error": str(e)})
        # Same exception type and message as before; the cause is now attached.
        raise RuntimeError("speech-to-text: Error transcribing audio") from e
270
-
271
-
272
def record_audio(duration, sample_rate=16000, chunk_size=1024):
    """Open a mono 16-bit PyAudio input stream and start reading it on a background thread.

    The thread reads up to ``int(sample_rate / chunk_size * duration)`` chunks and
    puts each one on a queue; it stops early once the returned event is set.

    Args:
        duration: Recording length in seconds used to compute the chunk count.
        sample_rate: Rate passed to the PyAudio stream.
        chunk_size: Frames per buffer and frames read per ``stream.read`` call.

    Returns:
        ``(p, stream, audio_queue, stop_event, audio_thread)``; the five values are
        the arguments ``stop_recording`` expects, in the same order.
    """
    log_counter("record_audio_attempt", labels={"duration": duration})
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=sample_rate,
                        input=True,
                        frames_per_buffer=chunk_size)
    except Exception:
        # Do not leave the PyAudio handle open when the stream cannot be created.
        p.terminate()
        raise

    print("Recording...")
    # Named stop_event (not stop_recording) to avoid shadowing the module function.
    stop_event = threading.Event()
    audio_queue = queue.Queue()

    def audio_callback():
        for _ in range(0, int(sample_rate / chunk_size * duration)):
            if stop_event.is_set():
                break
            audio_queue.put(stream.read(chunk_size))

    audio_thread = threading.Thread(target=audio_callback)
    audio_thread.start()

    return p, stream, audio_queue, stop_event, audio_thread
297
-
298
-
299
def stop_recording(p, stream, audio_queue, stop_recording_event, audio_thread):
    """Stop a recording started by ``record_audio`` and return the captured bytes.

    Args:
        p: PyAudio instance to terminate.
        stream: Input stream to stop and close.
        audio_queue: Queue holding the recorded chunks.
        stop_recording_event: Event that signals the reader thread to stop.
        audio_thread: Reader thread; joined before the queue is drained.

    Returns:
        All queued chunks concatenated into one ``bytes`` object.
    """
    log_counter("stop_recording_attempt")
    began = time.time()

    # Signal the reader, then wait for it so no more chunks arrive while draining.
    stop_recording_event.set()
    audio_thread.join()

    chunks = []
    while not audio_queue.empty():
        chunk = audio_queue.get()
        chunks.append(chunk)

    print("Recording finished.")

    # Release the audio device.
    stream.stop_stream()
    stream.close()
    p.terminate()

    elapsed = time.time() - began
    log_histogram("stop_recording_duration", elapsed)
    log_counter("stop_recording_success")
    return b''.join(chunks)
319
-
320
def save_audio_temp(audio_data, sample_rate=16000):
    """Write raw PCM bytes to a new temporary ``.wav`` file and return its path.

    The file is written as mono, 2 bytes per sample, at ``sample_rate``. It is
    created with ``delete=False``, so the caller is responsible for removing it.

    Args:
        audio_data: Raw audio frames to write.
        sample_rate: Frame rate stored in the WAV header.

    Returns:
        Path of the temporary WAV file.
    """
    import wave

    log_counter("save_audio_temp_attempt")

    # Reserve a unique file name, then close that handle before the wave writer
    # opens the file, so only one handle is ever open on it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_path = temp_file.name

    # The context manager closes the writer even if a write fails.
    with wave.open(temp_path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio_data)

    log_counter("save_audio_temp_success")
    return temp_path
332
-
333
- #
334
- #
335
  #######################################################################################################################
 
1
+ # Audio_Transcription_Lib.py
2
+ #########################################
3
+ # Transcription Library
4
+ # This library is used to perform transcription of audio files.
5
+ # Currently, uses faster_whisper for transcription.
6
+ #
7
+ ####################
8
+ # Function List
9
+ #
10
+ # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
11
+ # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False)
12
+ #
13
+ ####################
14
+ #
15
+ # Import necessary libraries to run solo for testing
16
+ import gc
17
+ import json
18
+ import logging
19
+ import multiprocessing
20
+ import os
21
+ import queue
22
+ import sys
23
+ import subprocess
24
+ import tempfile
25
+ import threading
26
+ import time
27
+ # DEBUG Imports
28
+ #from memory_profiler import profile
29
+ #import pyaudio
30
+ from faster_whisper import WhisperModel as OriginalWhisperModel
31
+ from typing import Optional, Union, List, Dict, Any
32
+ #
33
+ # Import Local
34
+ from App_Function_Libraries.Utils.Utils import load_comprehensive_config
35
+ from App_Function_Libraries.Metrics.metrics_logger import log_counter, log_histogram
36
+ #
37
+ #######################################################################################################################
38
+ # Function Definitions
39
+ #
40
+
41
+ # Convert video .m4a into .wav using ffmpeg
42
+ # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
43
+ # https://www.gyan.dev/ffmpeg/builds/
44
+ #
45
+
46
+
47
# Shared model handle; starts empty and is assigned by get_whisper_model().
+ whisper_model_instance = None
48
# Project configuration, loaded once when this module is imported.
+ config = load_comprehensive_config()
49
# Device setting read from the [Processing] section; falls back to 'cpu' when absent.
+ processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
50
# CPU count of the host; only referenced in a commented-out default in the code shown here.
+ total_thread_count = multiprocessing.cpu_count()
51
+
52
+
53
class WhisperModel(OriginalWhisperModel):
    """faster_whisper model wrapper that stores and looks up models under the project tree.

    Differences from the parent constructor:
      * ``download_root`` defaults to ``<tldw_dir>/models/Whisper`` and is created if missing.
      * A known model size that already exists as a directory under ``download_root``
        is passed on as that local directory.
      * Any other value is made absolute only when it exists on disk; otherwise it is
        handed to the parent unchanged (presumably a hub repository id or a model size
        this list does not know yet - the parent decides what to do with it).
    """

    # Two directory levels above this file.
    tldw_dir = os.path.dirname(os.path.dirname(__file__))
    default_download_root = os.path.join(tldw_dir, 'models', 'Whisper')

    valid_model_sizes = [
        "tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium",
        "large-v1", "large-v2", "large-v3", "large", "distil-large-v2", "distil-medium.en",
        "distil-small.en", "distil-large-v3",
    ]

    def __init__(
        self,
        model_size_or_path: str,
        device: str = processing_choice,
        device_index: Union[int, List[int]] = 0,
        compute_type: str = "default",
        cpu_threads: int = 0,  # FIXME (from original author): total_thread_count was considered; 0 kept
        num_workers: int = 1,
        download_root: Optional[str] = None,
        local_files_only: bool = False,
        files: Optional[Dict[str, Any]] = None,
        **model_kwargs: Any
    ):
        """Resolve the model location, then delegate to the faster_whisper constructor.

        Args:
            model_size_or_path: One of ``valid_model_sizes``, a local path, or any other
                identifier the parent class understands.
            device: Device string forwarded to the parent; defaults to the configured
                ``processing_choice``.
            device_index, compute_type, cpu_threads, num_workers, local_files_only:
                Forwarded to the parent unchanged.
            download_root: Directory for downloaded models; defaults to
                ``default_download_root``.
            files: Forwarded to the parent only when not None.
            **model_kwargs: Forwarded to the parent only when supplied.
                NOTE(review): ``files``/``model_kwargs`` were previously dropped silently;
                confirm the installed faster_whisper version accepts them.
        """
        if download_root is None:
            download_root = self.default_download_root

        os.makedirs(download_root, exist_ok=True)

        if model_size_or_path in self.valid_model_sizes:
            # Prefer an existing local copy; otherwise keep the size name so the
            # parent class can fetch the model into download_root.
            local_copy = os.path.join(download_root, model_size_or_path)
            if os.path.isdir(local_copy):
                model_size_or_path = local_copy
        elif os.path.exists(model_size_or_path):
            # A real path on disk: normalise it.
            model_size_or_path = os.path.abspath(model_size_or_path)
        # Anything else is passed through untouched; turning it into an absolute
        # path would corrupt identifiers that are not filesystem paths.

        # Only forward the optional extras when the caller actually provided them,
        # so the call stays identical to the previous one in the default case.
        optional_kwargs = dict(model_kwargs)
        if files is not None:
            optional_kwargs["files"] = files

        super().__init__(
            model_size_or_path,
            device=device,
            device_index=device_index,
            compute_type=compute_type,
            cpu_threads=cpu_threads,
            num_workers=num_workers,
            download_root=download_root,
            local_files_only=local_files_only,
            **optional_kwargs
        )
110
+
111
# (model_name, device) of the model currently held in whisper_model_instance.
_whisper_model_key = None


def get_whisper_model(model_name, device):
    """Return the shared WhisperModel for ``model_name`` on ``device``.

    A single model is kept in the module-level ``whisper_model_instance``. It is
    created on first use and replaced when a different model name or device is
    requested; previously the first model loaded was returned for every later
    request regardless of what was asked for.

    Args:
        model_name: Model size or path, passed to ``WhisperModel``.
        device: Device string, passed to ``WhisperModel``.

    Returns:
        The cached or newly created ``WhisperModel`` instance.
    """
    global whisper_model_instance, _whisper_model_key
    requested = (model_name, device)
    if whisper_model_instance is None or _whisper_model_key != requested:
        logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
        whisper_model_instance = WhisperModel(model_name, device=device)
        _whisper_model_key = requested
    return whisper_model_instance
117
+
118
+ # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
119
+ #DEBUG
120
+ #@profile
121
def convert_to_wav(video_file_path, offset=0, overwrite=False):
    """Convert a media file to a 16 kHz, mono, 16-bit PCM WAV file using ffmpeg.

    The output is written next to the input, with the extension replaced by ``.wav``.

    Args:
        video_file_path: Path of the input media file.
        offset: Accepted for backward compatibility but not used; conversion always
            starts at 00:00:00.
        overwrite: When False and the WAV already exists, conversion is skipped.
            When True, ffmpeg is told to replace the existing file.

    Returns:
        The output WAV path on success or when conversion is skipped. On failure a
        dict ``{"error": <message>}`` is returned instead of raising (kept for
        existing callers).
    """
    log_counter("convert_to_wav_attempt", labels={"file_path": video_file_path})
    start_time = time.time()

    out_path = os.path.splitext(video_file_path)[0] + ".wav"

    if os.path.exists(out_path) and not overwrite:
        print(f"File '{out_path}' already exists. Skipping conversion.")
        logging.info(f"Skipping conversion as file already exists: {out_path}")
        log_counter("convert_to_wav_skipped", labels={"file_path": video_file_path})
        return out_path

    print("Starting conversion process of .m4a to .WAV")

    try:
        if os.name == "nt":
            logging.debug("ffmpeg being ran on windows")
            # Assumes the working directory is the one that contains .\Bin
            ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
            logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
        elif os.name == "posix":
            ffmpeg_cmd = "ffmpeg"  # Assume 'ffmpeg' is in PATH
        else:
            raise RuntimeError("Unsupported operating system")

        command = [ffmpeg_cmd]
        if overwrite:
            # stdin is detached below, so ffmpeg cannot ask whether to overwrite;
            # without -y it would refuse and exit with an error.
            command.append("-y")
        command += [
            "-ss", "00:00:00",      # Start at the beginning of the input
            "-i", video_file_path,
            "-ar", "16000",         # Audio sample rate
            "-ac", "1",             # Number of audio channels
            "-c:a", "pcm_s16le",    # Audio codec
            out_path,
        ]

        # An argument list (no shell) is used on every platform so that file names
        # containing quotes or shell metacharacters are passed through literally,
        # and the exit status is checked instead of being ignored.
        result = subprocess.run(command, stdin=subprocess.DEVNULL, text=True, capture_output=True)
        if result.returncode != 0:
            logging.error("Error in running FFmpeg")
            logging.error("FFmpeg stderr: %s", result.stderr)
            raise RuntimeError(f"FFmpeg error: {result.stderr}")
        logging.info("FFmpeg executed successfully")
        logging.debug("FFmpeg output: %s", result.stdout)

        logging.info("Conversion to WAV completed: %s", out_path)
        log_counter("convert_to_wav_success", labels={"file_path": video_file_path})
    except Exception as e:
        logging.error("convert_to_wav: Error converting file to WAV: %s", str(e))
        log_counter("convert_to_wav_error", labels={"file_path": video_file_path, "error": str(e)})
        return {"error": str(e)}

    conversion_time = time.time() - start_time
    log_histogram("convert_to_wav_duration", conversion_time, labels={"file_path": video_file_path})

    gc.collect()
    return out_path
185
+
186
+
187
+ # Transcribe .wav into .segments.json
188
+ #DEBUG
189
+ #@profile
190
+ # FIXME - I feel like the `vad_filter` should be enabled by default....
191
def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
    """Transcribe an audio file and cache the result as JSON next to it.

    Two files are written beside the audio file, named
    ``<audio path without extension>-whisper_model-<model>.segments.json`` and
    ``...segments_pretty.json``. If the first one already exists it is loaded and
    returned instead of transcribing again.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        selected_source_lang: Passed to the model as ``language``.
        whisper_model: Model size or path handed to ``get_whisper_model``.
        vad_filter: Passed through to the model's ``transcribe`` call.
        diarize: Accepted but not used by this function.

    Returns:
        A list of dicts with keys ``Time_Start``, ``Time_End`` and ``Text``. The
        first segment's text is prefixed with a line naming the model used. The
        cached path now returns the same list shape (the stored file wraps the
        list in ``{"segments": [...]}``, which used to be returned as-is).

    Raises:
        ValueError: ``audio_file_path`` is None.
        RuntimeError: Any failure while loading the cache, transcribing or saving,
            including an empty transcription. The original error is chained.
    """
    log_counter("speech_to_text_attempt", labels={"file_path": audio_file_path, "model": whisper_model})
    time_start = time.time()

    if audio_file_path is None:
        log_counter("speech_to_text_error", labels={"error": "No audio file provided"})
        raise ValueError("speech-to-text: No audio file provided")
    logging.info("speech-to-text: Audio file path: %s", audio_file_path)

    try:
        # Build the output names from the path stem. Substituting the extension
        # with str.replace would touch every occurrence of that text in the path,
        # and with an empty extension would insert the suffix between all characters.
        base_path = os.path.splitext(audio_file_path)[0]
        out_file = base_path + "-whisper_model-" + whisper_model + ".segments.json"
        prettified_out_file = base_path + "-whisper_model-" + whisper_model + ".segments_pretty.json"

        if os.path.exists(out_file):
            logging.info("speech-to-text: Segments file already exists: %s", out_file)
            with open(out_file, encoding="utf-8") as f:
                cached = json.load(f)
            # Unwrap the on-disk envelope so cached and fresh results have one shape.
            if isinstance(cached, dict) and "segments" in cached:
                cached = cached["segments"]
            return cached

        logging.info('speech-to-text: Starting transcription...')
        # FIXME - revisit this
        transcribe_options = dict(
            task="transcribe",
            language=selected_source_lang,
            beam_size=10,
            best_of=10,
            vad_filter=vad_filter,
        )
        logging.debug("speech-to-text: Using whisper model: %s", whisper_model)
        model = get_whisper_model(whisper_model, processing_choice)
        # faster_whisper transcription right here - FIXME - test batching
        segments_raw, _info = model.transcribe(audio_file_path, **transcribe_options)

        segments = []
        for segment_chunk in segments_raw:
            chunk = {
                "Time_Start": segment_chunk.start,
                "Time_End": segment_chunk.end,
                "Text": segment_chunk.text
            }
            logging.debug("Segment: %s", chunk)
            segments.append(chunk)
            logging.info(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")
            logging.debug(
                f"Transcribed Segment: {segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")

        if not segments:
            log_counter("speech_to_text_error", labels={"error": "No transcription produced"})
            raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")

        segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]

        transcription_time = time.time() - time_start
        logging.info("speech-to-text: Transcription completed in %.2f seconds", transcription_time)
        log_histogram("speech_to_text_duration", transcription_time, labels={"file_path": audio_file_path, "model": whisper_model})
        log_counter("speech_to_text_success", labels={"file_path": audio_file_path, "model": whisper_model})

        # Save the segments to a JSON file - prettified and non-prettified
        # FIXME refactor so this is an optional flag to save either the prettified json file or the normal one
        logging.info("speech-to-text: Saving segments to JSON file")
        output_data = {'segments': segments}

        logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
        with open(prettified_out_file, 'w', encoding="utf-8") as f:
            json.dump(output_data, f, indent=2)

        logging.info("speech-to-text: Saving JSON to %s", out_file)
        with open(out_file, 'w', encoding="utf-8") as f:
            json.dump(output_data, f)

        logging.debug(f"speech-to-text: returning {segments[:500]}")
        gc.collect()
        return segments

    except Exception as e:
        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
        log_counter("speech_to_text_error", labels={"file_path": audio_file_path, "model": whisper_model, "error": str(e)})
        # Same exception type and message as before; the cause is now attached.
        raise RuntimeError("speech-to-text: Error transcribing audio") from e
270
+
271
+
272
def record_audio(duration, sample_rate=16000, chunk_size=1024):
    """Placeholder: does nothing and returns None.

    All three arguments are accepted and ignored.
    """
    return None
274
+
275
+
276
def stop_recording(p, stream, audio_queue, stop_recording_event, audio_thread):
    """Placeholder: does nothing and returns None.

    All five arguments are accepted and ignored.
    """
    return None
278
+
279
def save_audio_temp(audio_data, sample_rate=16000):
    """Placeholder: does nothing and returns None.

    Both arguments are accepted and ignored; no file is written.
    """
    return None
281
+
282
+ #
283
+ #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
  #######################################################################################################################