oceansweep committed on
Commit
63e8a17
·
verified ·
1 Parent(s): 429541f

Update App_Function_Libraries/Audio_Transcription_Lib.py

Browse files
App_Function_Libraries/Audio_Transcription_Lib.py CHANGED
@@ -1,247 +1,247 @@
1
- # Audio_Transcription_Lib.py
2
- #########################################
3
- # Transcription Library
4
- # This library is used to perform transcription of audio files.
5
- # Currently, uses faster_whisper for transcription.
6
- #
7
- ####################
8
- # Function List
9
- #
10
- # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
11
- # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False)
12
- #
13
- ####################
14
- #
15
- # Import necessary libraries to run solo for testing
16
- import gc
17
- import json
18
- import logging
19
- import os
20
- import queue
21
- import sys
22
- import subprocess
23
- import tempfile
24
- import threading
25
- import time
26
- import configparser
27
- # DEBUG Imports
28
- #from memory_profiler import profile
29
- import pyaudio
30
- # Import Local
31
- #
32
- #######################################################################################################################
33
- # Function Definitions
34
- #
35
-
36
- # Convert video .m4a into .wav using ffmpeg
37
- # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
38
- # https://www.gyan.dev/ffmpeg/builds/
39
- #
40
-
41
-
42
- whisper_model_instance = None
43
- # Retrieve processing choice from the configuration file
44
- config = configparser.ConfigParser()
45
- config.read('config.txt')
46
- processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
47
-
48
-
49
# The cache holds a single model. It is keyed on (model_name, device) so that a
# request for a different model replaces the cached one instead of silently
# returning a stale instance, and the previous model is released first.
def get_whisper_model(model_name, device):
    """Return a cached faster_whisper ``WhisperModel`` for the given settings.

    Args:
        model_name: Model size/name passed to ``WhisperModel``.
        device: Device string passed to ``WhisperModel`` as ``device=``.

    Returns:
        The module-level ``whisper_model_instance``; it is (re)created when no
        model is loaded yet or when the requested name/device differs from the
        one that was loaded by the previous call.
    """
    global whisper_model_instance
    requested_key = (model_name, device)
    loaded_key = getattr(get_whisper_model, "_loaded_key", None)

    if whisper_model_instance is None or loaded_key != requested_key:
        # Imported lazily so the module can be imported without faster_whisper.
        from faster_whisper import WhisperModel
        logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
        # Drop the old model before loading the new one to keep peak memory down.
        whisper_model_instance = None
        gc.collect()
        whisper_model_instance = WhisperModel(model_name, device=device)
        get_whisper_model._loaded_key = requested_key
    return whisper_model_instance
58
-
59
-
60
- # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
61
- #DEBUG
62
- #@profile
63
def convert_to_wav(video_file_path, offset=0, overwrite=False):
    """Convert a media file to a 16 kHz, mono, ``pcm_s16le`` WAV using ffmpeg.

    The output is written next to the input, with the extension replaced by
    ``.wav``.

    Args:
        video_file_path: Path of the input media file.
        offset: Start position handed to ffmpeg's ``-ss`` option (seconds or a
            ``HH:MM:SS`` string). Defaults to the beginning of the file.
        overwrite: When False and the ``.wav`` already exists, the conversion
            is skipped and the existing path is returned.

    Returns:
        The output ``.wav`` path on success (or when skipped). On any failure a
        dict ``{"error": <message>}`` is returned instead of raising, which is
        the error contract the previous implementation exposed to callers.
    """
    out_path = os.path.splitext(video_file_path)[0] + ".wav"

    if os.path.exists(out_path) and not overwrite:
        print(f"File '{out_path}' already exists. Skipping conversion.")
        logging.info(f"Skipping conversion as file already exists: {out_path}")
        return out_path
    print("Starting conversion process of .m4a to .WAV")

    try:
        if os.name == "nt":
            logging.debug("ffmpeg being ran on windows")
            # Assumes the working directory is the one containing .\Bin
            ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
        elif os.name == "posix":
            ffmpeg_cmd = "ffmpeg"  # Assume 'ffmpeg' is in PATH
        else:
            raise RuntimeError("Unsupported operating system")
        logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")

        # Argument list (no shell) so paths with quotes/spaces cannot be
        # interpreted by a shell. "-y" is safe here: this point is only reached
        # when the output is missing or the caller asked to overwrite it, and
        # without it ffmpeg refuses to overwrite because stdin is not a tty.
        command = [
            ffmpeg_cmd,
            "-y",
            "-ss", str(offset),      # Start position
            "-i", video_file_path,
            "-ar", "16000",          # Audio sample rate
            "-ac", "1",              # Number of audio channels
            "-c:a", "pcm_s16le",     # Audio codec
            out_path,
        ]
        # stdin is the null device so ffmpeg never waits for input.
        result = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,
            text=True,
            errors="replace",
            capture_output=True,
        )
        if result.returncode != 0:
            logging.error("Error in running FFmpeg")
            logging.error("FFmpeg stderr: %s", result.stderr)
            raise RuntimeError(f"FFmpeg error: {result.stderr}")
        logging.info("FFmpeg executed successfully")
        logging.debug("FFmpeg output: %s", result.stdout)
        logging.info("Conversion to WAV completed: %s", out_path)
    except Exception as e:
        logging.error("convert_to_wav: Error converting file to WAV: %s", str(e))
        return {"error": str(e)}
    gc.collect()
    return out_path
119
-
120
-
121
- # Transcribe .wav into .segments.json
122
- #DEBUG
123
- #@profile
124
def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
    """Transcribe an audio file with faster_whisper and cache the result as JSON.

    Two files are written next to the audio file: ``<base>.segments.json`` and
    an indented ``<base>.segments_pretty.json``, both shaped as
    ``{"segments": [...]}``. If ``<base>.segments.json`` already exists it is
    loaded and returned without transcribing again.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        selected_source_lang: Language code passed to the model as ``language``.
        whisper_model: Model name handed to ``get_whisper_model``.
        vad_filter: Passed through to the model's ``transcribe`` call.
        diarize: Accepted for interface compatibility; not used here.

    Returns:
        A list of dicts with ``Time_Start``, ``Time_End`` and ``Text`` keys.

    Raises:
        ValueError: If ``audio_file_path`` is None.
        RuntimeError: If anything fails while loading the cache, transcribing
            or saving (the original exception is attached as the cause).
    """
    # 'segments' is kept as a module global because the previous implementation
    # published it that way; callers should rely on the return value instead.
    global whisper_model_instance, processing_choice, segments
    logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)

    time_start = time.time()
    if audio_file_path is None:
        raise ValueError("speech-to-text: No audio file provided")
    logging.info("speech-to-text: Audio file path: %s", audio_file_path)

    try:
        # Build output names from the extension-less path. Using str.replace on
        # the extension would also rewrite matching text elsewhere in the path
        # and breaks completely when the file has no extension.
        base_path = os.path.splitext(audio_file_path)[0]
        out_file = base_path + ".segments.json"
        prettified_out_file = base_path + ".segments_pretty.json"

        if os.path.exists(out_file):
            logging.info("speech-to-text: Segments file already exists: %s", out_file)
            with open(out_file, encoding="utf-8") as f:
                cached = json.load(f)
            # Files are saved as {"segments": [...]}; unwrap so a cache hit
            # returns the same list shape as a fresh transcription.
            if isinstance(cached, dict) and "segments" in cached:
                segments = cached["segments"]
            else:
                segments = cached
            return segments

        logging.info('speech-to-text: Starting transcription...')
        options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
        transcribe_options = dict(task="transcribe", **options)
        # use function and config at top of file
        whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
        segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)

        segments = []
        for segment_chunk in segments_raw:
            chunk = {
                "Time_Start": segment_chunk.start,
                "Time_End": segment_chunk.end,
                "Text": segment_chunk.text
            }
            logging.debug("Segment: %s", chunk)
            segments.append(chunk)

        if not segments:
            raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")

        # Record which model produced the transcript at the top of the text.
        segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
        logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)

        # Save the segments to a JSON file - prettified and non-prettified
        # FIXME so this is an optional flag to save either the prettified json file or the normal one
        save_json = True
        if save_json:
            logging.info("speech-to-text: Saving segments to JSON file")
            output_data = {'segments': segments}

            logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
            with open(prettified_out_file, 'w', encoding="utf-8") as f:
                json.dump(output_data, f, indent=2)

            logging.info("speech-to-text: Saving JSON to %s", out_file)
            with open(out_file, 'w', encoding="utf-8") as f:
                json.dump(output_data, f)

        logging.debug(f"speech-to-text: returning {segments[:500]}")
        gc.collect()
        return segments

    except Exception as e:
        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
        raise RuntimeError("speech-to-text: Error transcribing audio") from e
190
-
191
-
192
def record_audio(duration, sample_rate=16000, chunk_size=1024):
    """Start recording mono 16-bit audio on a background thread.

    Args:
        duration: Maximum recording length in seconds.
        sample_rate: Sample rate passed to the PyAudio input stream.
        chunk_size: Frames read per ``stream.read`` call.

    Returns:
        A tuple ``(p, stream, audio_queue, stop_event, audio_thread)`` meant to
        be passed unchanged to ``stop_recording``: the PyAudio instance, the
        open input stream, the queue receiving raw chunks, the event that ends
        the capture loop early, and the already-started reader thread.
    """
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=sample_rate,
                        input=True,
                        frames_per_buffer=chunk_size)
    except Exception:
        # Do not leak the PyAudio handle when the input stream cannot open.
        p.terminate()
        raise

    print("Recording...")
    # Named stop_event so it does not shadow the module-level stop_recording().
    stop_event = threading.Event()
    audio_queue = queue.Queue()

    def audio_callback():
        # Read at most `duration` seconds of audio, chunk by chunk.
        for _ in range(int(sample_rate / chunk_size * duration)):
            if stop_event.is_set():
                break
            audio_queue.put(stream.read(chunk_size))

    audio_thread = threading.Thread(target=audio_callback)
    audio_thread.start()

    return p, stream, audio_queue, stop_event, audio_thread
216
-
217
-
218
def stop_recording(p, stream, audio_queue, stop_recording_event, audio_thread):
    """Stop a recording started by ``record_audio`` and return the captured bytes.

    Args:
        p: Object whose ``terminate()`` releases the audio backend.
        stream: Open input stream; ``stop_stream()`` and ``close()`` are called.
        audio_queue: Queue holding the raw audio chunks that were read.
        stop_recording_event: Event that tells the reader thread to stop.
        audio_thread: The reader thread; joined before the queue is drained.

    Returns:
        All queued chunks concatenated into one ``bytes`` object.
    """
    frames = []
    try:
        stop_recording_event.set()
        audio_thread.join()

        # The producer has finished, so draining until Empty collects everything.
        while True:
            try:
                frames.append(audio_queue.get_nowait())
            except queue.Empty:
                break

        print("Recording finished.")
    finally:
        # Always release the audio device, even if joining or draining failed.
        try:
            stream.stop_stream()
            stream.close()
        finally:
            p.terminate()

    return b''.join(frames)
233
-
234
def save_audio_temp(audio_data, sample_rate=16000):
    """Write raw PCM bytes to a new temporary ``.wav`` file and return its path.

    The file is written as 1 channel with a 2-byte sample width at
    ``sample_rate``. It is created with ``delete=False``, so removing it is the
    caller's responsibility.

    Args:
        audio_data: Raw audio frames as bytes.
        sample_rate: Frame rate stored in the WAV header.

    Returns:
        The path of the temporary ``.wav`` file.
    """
    import wave

    # Reserve a unique name, then close the handle before the wave writer
    # opens the same path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_path = temp_file.name

    try:
        # The context manager closes the writer even if writing fails.
        with wave.open(temp_path, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes(audio_data)
    except Exception:
        # Do not leave a broken temp file behind.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise
    return temp_path
244
-
245
- #
246
- #
247
  #######################################################################################################################
 
1
+ # Audio_Transcription_Lib.py
2
+ #########################################
3
+ # Transcription Library
4
+ # This library is used to perform transcription of audio files.
5
+ # Currently, uses faster_whisper for transcription.
6
+ #
7
+ ####################
8
+ # Function List
9
+ #
10
+ # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
11
+ # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False)
12
+ #
13
+ ####################
14
+ #
15
+ # Import necessary libraries to run solo for testing
16
+ import gc
17
+ import json
18
+ import logging
19
+ import os
20
+ import queue
21
+ import sys
22
+ import subprocess
23
+ import tempfile
24
+ import threading
25
+ import time
26
+ import configparser
27
+ # DEBUG Imports
28
+ #from memory_profiler import profile
29
+ #import pyaudio
30
+ # Import Local
31
+ #
32
+ #######################################################################################################################
33
+ # Function Definitions
34
+ #
35
+
36
+ # Convert video .m4a into .wav using ffmpeg
37
+ # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
38
+ # https://www.gyan.dev/ffmpeg/builds/
39
+ #
40
+
41
+
42
+ whisper_model_instance = None
43
+ # Retrieve processing choice from the configuration file
44
+ config = configparser.ConfigParser()
45
+ config.read('config.txt')
46
+ processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
47
+
48
+
49
# The cache holds a single model. It is keyed on (model_name, device) so that a
# request for a different model replaces the cached one instead of silently
# returning a stale instance, and the previous model is released first.
def get_whisper_model(model_name, device):
    """Return a cached faster_whisper ``WhisperModel`` for the given settings.

    Args:
        model_name: Model size/name passed to ``WhisperModel``.
        device: Device string passed to ``WhisperModel`` as ``device=``.

    Returns:
        The module-level ``whisper_model_instance``; it is (re)created when no
        model is loaded yet or when the requested name/device differs from the
        one that was loaded by the previous call.
    """
    global whisper_model_instance
    requested_key = (model_name, device)
    loaded_key = getattr(get_whisper_model, "_loaded_key", None)

    if whisper_model_instance is None or loaded_key != requested_key:
        # Imported lazily so the module can be imported without faster_whisper.
        from faster_whisper import WhisperModel
        logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
        # Drop the old model before loading the new one to keep peak memory down.
        whisper_model_instance = None
        gc.collect()
        whisper_model_instance = WhisperModel(model_name, device=device)
        get_whisper_model._loaded_key = requested_key
    return whisper_model_instance
58
+
59
+
60
+ # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
61
+ #DEBUG
62
+ #@profile
63
def convert_to_wav(video_file_path, offset=0, overwrite=False):
    """Convert a media file to a 16 kHz, mono, ``pcm_s16le`` WAV using ffmpeg.

    The output is written next to the input, with the extension replaced by
    ``.wav``.

    Args:
        video_file_path: Path of the input media file.
        offset: Start position handed to ffmpeg's ``-ss`` option (seconds or a
            ``HH:MM:SS`` string). Defaults to the beginning of the file.
        overwrite: When False and the ``.wav`` already exists, the conversion
            is skipped and the existing path is returned.

    Returns:
        The output ``.wav`` path on success (or when skipped). On any failure a
        dict ``{"error": <message>}`` is returned instead of raising, which is
        the error contract the previous implementation exposed to callers.
    """
    out_path = os.path.splitext(video_file_path)[0] + ".wav"

    if os.path.exists(out_path) and not overwrite:
        print(f"File '{out_path}' already exists. Skipping conversion.")
        logging.info(f"Skipping conversion as file already exists: {out_path}")
        return out_path
    print("Starting conversion process of .m4a to .WAV")

    try:
        if os.name == "nt":
            logging.debug("ffmpeg being ran on windows")
            # Assumes the working directory is the one containing .\Bin
            ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
        elif os.name == "posix":
            ffmpeg_cmd = "ffmpeg"  # Assume 'ffmpeg' is in PATH
        else:
            raise RuntimeError("Unsupported operating system")
        logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")

        # Argument list (no shell) so paths with quotes/spaces cannot be
        # interpreted by a shell. "-y" is safe here: this point is only reached
        # when the output is missing or the caller asked to overwrite it, and
        # without it ffmpeg refuses to overwrite because stdin is not a tty.
        command = [
            ffmpeg_cmd,
            "-y",
            "-ss", str(offset),      # Start position
            "-i", video_file_path,
            "-ar", "16000",          # Audio sample rate
            "-ac", "1",              # Number of audio channels
            "-c:a", "pcm_s16le",     # Audio codec
            out_path,
        ]
        # stdin is the null device so ffmpeg never waits for input.
        result = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,
            text=True,
            errors="replace",
            capture_output=True,
        )
        if result.returncode != 0:
            logging.error("Error in running FFmpeg")
            logging.error("FFmpeg stderr: %s", result.stderr)
            raise RuntimeError(f"FFmpeg error: {result.stderr}")
        logging.info("FFmpeg executed successfully")
        logging.debug("FFmpeg output: %s", result.stdout)
        logging.info("Conversion to WAV completed: %s", out_path)
    except Exception as e:
        logging.error("convert_to_wav: Error converting file to WAV: %s", str(e))
        return {"error": str(e)}
    gc.collect()
    return out_path
119
+
120
+
121
+ # Transcribe .wav into .segments.json
122
+ #DEBUG
123
+ #@profile
124
def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
    """Transcribe an audio file with faster_whisper and cache the result as JSON.

    Two files are written next to the audio file: ``<base>.segments.json`` and
    an indented ``<base>.segments_pretty.json``, both shaped as
    ``{"segments": [...]}``. If ``<base>.segments.json`` already exists it is
    loaded and returned without transcribing again.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        selected_source_lang: Language code passed to the model as ``language``.
        whisper_model: Model name handed to ``get_whisper_model``.
        vad_filter: Passed through to the model's ``transcribe`` call.
        diarize: Accepted for interface compatibility; not used here.

    Returns:
        A list of dicts with ``Time_Start``, ``Time_End`` and ``Text`` keys.

    Raises:
        ValueError: If ``audio_file_path`` is None.
        RuntimeError: If anything fails while loading the cache, transcribing
            or saving (the original exception is attached as the cause).
    """
    # 'segments' is kept as a module global because the previous implementation
    # published it that way; callers should rely on the return value instead.
    global whisper_model_instance, processing_choice, segments
    logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)

    time_start = time.time()
    if audio_file_path is None:
        raise ValueError("speech-to-text: No audio file provided")
    logging.info("speech-to-text: Audio file path: %s", audio_file_path)

    try:
        # Build output names from the extension-less path. Using str.replace on
        # the extension would also rewrite matching text elsewhere in the path
        # and breaks completely when the file has no extension.
        base_path = os.path.splitext(audio_file_path)[0]
        out_file = base_path + ".segments.json"
        prettified_out_file = base_path + ".segments_pretty.json"

        if os.path.exists(out_file):
            logging.info("speech-to-text: Segments file already exists: %s", out_file)
            with open(out_file, encoding="utf-8") as f:
                cached = json.load(f)
            # Files are saved as {"segments": [...]}; unwrap so a cache hit
            # returns the same list shape as a fresh transcription.
            if isinstance(cached, dict) and "segments" in cached:
                segments = cached["segments"]
            else:
                segments = cached
            return segments

        logging.info('speech-to-text: Starting transcription...')
        options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
        transcribe_options = dict(task="transcribe", **options)
        # use function and config at top of file
        whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
        segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)

        segments = []
        for segment_chunk in segments_raw:
            chunk = {
                "Time_Start": segment_chunk.start,
                "Time_End": segment_chunk.end,
                "Text": segment_chunk.text
            }
            logging.debug("Segment: %s", chunk)
            segments.append(chunk)

        if not segments:
            raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")

        # Record which model produced the transcript at the top of the text.
        segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
        logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)

        # Save the segments to a JSON file - prettified and non-prettified
        # FIXME so this is an optional flag to save either the prettified json file or the normal one
        save_json = True
        if save_json:
            logging.info("speech-to-text: Saving segments to JSON file")
            output_data = {'segments': segments}

            logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
            with open(prettified_out_file, 'w', encoding="utf-8") as f:
                json.dump(output_data, f, indent=2)

            logging.info("speech-to-text: Saving JSON to %s", out_file)
            with open(out_file, 'w', encoding="utf-8") as f:
                json.dump(output_data, f)

        logging.debug(f"speech-to-text: returning {segments[:500]}")
        gc.collect()
        return segments

    except Exception as e:
        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
        raise RuntimeError("speech-to-text: Error transcribing audio") from e
190
+
191
+
192
def record_audio(duration, sample_rate=16000, chunk_size=1024):
    """Start recording mono 16-bit audio on a background thread.

    PyAudio is imported inside the function so this module can still be
    imported on systems where PyAudio is not installed; only calling this
    function requires it.

    Args:
        duration: Maximum recording length in seconds.
        sample_rate: Sample rate passed to the PyAudio input stream.
        chunk_size: Frames read per ``stream.read`` call.

    Returns:
        A tuple ``(p, stream, audio_queue, stop_event, audio_thread)`` meant to
        be passed unchanged to ``stop_recording``: the PyAudio instance, the
        open input stream, the queue receiving raw chunks, the event that ends
        the capture loop early, and the already-started reader thread.
    """
    import pyaudio

    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=sample_rate,
                        input=True,
                        frames_per_buffer=chunk_size)
    except Exception:
        # Do not leak the PyAudio handle when the input stream cannot open.
        p.terminate()
        raise

    print("Recording...")
    # Named stop_event so it does not shadow the module-level stop_recording().
    stop_event = threading.Event()
    audio_queue = queue.Queue()

    def audio_callback():
        # Read at most `duration` seconds of audio, chunk by chunk.
        for _ in range(int(sample_rate / chunk_size * duration)):
            if stop_event.is_set():
                break
            audio_queue.put(stream.read(chunk_size))

    audio_thread = threading.Thread(target=audio_callback)
    audio_thread.start()

    return p, stream, audio_queue, stop_event, audio_thread
216
+
217
+
218
def stop_recording(p, stream, audio_queue, stop_recording_event, audio_thread):
    """Stop a recording started by ``record_audio`` and return the captured bytes.

    Args:
        p: Object whose ``terminate()`` releases the audio backend.
        stream: Open input stream; ``stop_stream()`` and ``close()`` are called.
        audio_queue: Queue holding the raw audio chunks that were read.
        stop_recording_event: Event that tells the reader thread to stop.
        audio_thread: The reader thread; joined before the queue is drained.

    Returns:
        All queued chunks concatenated into one ``bytes`` object.
    """
    frames = []
    try:
        stop_recording_event.set()
        audio_thread.join()

        # The producer has finished, so draining until Empty collects everything.
        while True:
            try:
                frames.append(audio_queue.get_nowait())
            except queue.Empty:
                break

        print("Recording finished.")
    finally:
        # Always release the audio device, even if joining or draining failed.
        try:
            stream.stop_stream()
            stream.close()
        finally:
            p.terminate()

    return b''.join(frames)
233
+
234
def save_audio_temp(audio_data, sample_rate=16000):
    """Write raw PCM bytes to a new temporary ``.wav`` file and return its path.

    The file is written as 1 channel with a 2-byte sample width at
    ``sample_rate``. It is created with ``delete=False``, so removing it is the
    caller's responsibility.

    Args:
        audio_data: Raw audio frames as bytes.
        sample_rate: Frame rate stored in the WAV header.

    Returns:
        The path of the temporary ``.wav`` file.
    """
    import wave

    # Reserve a unique name, then close the handle before the wave writer
    # opens the same path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_path = temp_file.name

    try:
        # The context manager closes the writer even if writing fails.
        with wave.open(temp_path, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes(audio_data)
    except Exception:
        # Do not leave a broken temp file behind.
        if os.path.exists(temp_path):
            os.remove(temp_path)
        raise
    return temp_path
244
+
245
+ #
246
+ #
247
  #######################################################################################################################