oceansweep committed on
Commit c7e020d
1 Parent(s): 8d1d1bc

Update App_Function_Libraries/Audio_Transcription_Lib.py

App_Function_Libraries/Audio_Transcription_Lib.py CHANGED
@@ -1,158 +1,192 @@
- # Audio_Transcription_Lib.py
- #########################################
- # Transcription Library
- # This library is used to perform transcription of audio files.
- # Currently, uses faster_whisper for transcription.
- #
- ####
- import configparser
- ####################
- # Function List
- #
- # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
- # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False)
- #
- ####################
- #
- # Import necessary libraries to run solo for testing
- import json
- import logging
- import os
- import sys
- import subprocess
- import time
-
- # Import Local
- #
- #######################################################################################################################
- # Function Definitions
- #
-
- # Convert video .m4a into .wav using ffmpeg
- # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
- # https://www.gyan.dev/ffmpeg/builds/
- #
-
-
- # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
- def convert_to_wav(video_file_path, offset=0, overwrite=False):
-     out_path = os.path.splitext(video_file_path)[0] + ".wav"
-
-     if os.path.exists(out_path) and not overwrite:
-         print(f"File '{out_path}' already exists. Skipping conversion.")
-         logging.info(f"Skipping conversion as file already exists: {out_path}")
-         return out_path
-     print("Starting conversion process of .m4a to .WAV")
-     out_path = os.path.splitext(video_file_path)[0] + ".wav"
-
-     try:
-         if os.name == "nt":
-             logging.debug("ffmpeg being ran on windows")
-
-             if sys.platform.startswith('win'):
-                 ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
-                 logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
-             else:
-                 ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems
-
-             command = [
-                 ffmpeg_cmd, # Assuming the working directory is correctly set where .\Bin exists
-                 "-ss", "00:00:00", # Start at the beginning of the video
-                 "-i", video_file_path,
-                 "-ar", "16000", # Audio sample rate
-                 "-ac", "1", # Number of audio channels
-                 "-c:a", "pcm_s16le", # Audio codec
-                 out_path
-             ]
-             try:
-                 # Redirect stdin from null device to prevent ffmpeg from waiting for input
-                 with open(os.devnull, 'rb') as null_file:
-                     result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
-                 if result.returncode == 0:
-                     logging.info("FFmpeg executed successfully")
-                     logging.debug("FFmpeg output: %s", result.stdout)
-                 else:
-                     logging.error("Error in running FFmpeg")
-                     logging.error("FFmpeg stderr: %s", result.stderr)
-                     raise RuntimeError(f"FFmpeg error: {result.stderr}")
-             except Exception as e:
-                 logging.error("Error occurred - ffmpeg doesn't like windows")
-                 raise RuntimeError("ffmpeg failed")
-         elif os.name == "posix":
-             os.system(f'ffmpeg -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
-         else:
-             raise RuntimeError("Unsupported operating system")
-         logging.info("Conversion to WAV completed: %s", out_path)
-     except subprocess.CalledProcessError as e:
-         logging.error("Error executing FFmpeg command: %s", str(e))
-         raise RuntimeError("Error converting video file to WAV")
-     except Exception as e:
-         logging.error("speech-to-text: Error transcribing audio: %s", str(e))
-         return {"error": str(e)}
-     return out_path
-
-
- # Transcribe .wav into .segments.json
- def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
-     logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)
-     from faster_whisper import WhisperModel
-     # Retrieve processing choice from the configuration file
-     config = configparser.ConfigParser()
-     config.read('config.txt')
-     processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
-     model = WhisperModel(whisper_model, device=f"{processing_choice}")
-     time_start = time.time()
-     if audio_file_path is None:
-         raise ValueError("speech-to-text: No audio file provided")
-     logging.info("speech-to-text: Audio file path: %s", audio_file_path)
-
-     try:
-         _, file_ending = os.path.splitext(audio_file_path)
-         out_file = audio_file_path.replace(file_ending, ".segments.json")
-         prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
-         if os.path.exists(out_file):
-             logging.info("speech-to-text: Segments file already exists: %s", out_file)
-             with open(out_file) as f:
-                 global segments
-                 segments = json.load(f)
-             return segments
-
-         logging.info('speech-to-text: Starting transcription...')
-         options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
-         transcribe_options = dict(task="transcribe", **options)
-         segments_raw, info = model.transcribe(audio_file_path, **transcribe_options)
-
-         segments = []
-         for segment_chunk in segments_raw:
-             chunk = {
-                 "Time_Start": segment_chunk.start,
-                 "Time_End": segment_chunk.end,
-                 "Text": segment_chunk.text
-             }
-             logging.debug("Segment: %s", chunk)
-             segments.append(chunk)
-         if not segments:
-             raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")
-         logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)
-
-         # Create a dictionary with the 'segments' key
-         output_data = {'segments': segments}
-
-         # Save prettified JSON
-         logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
-         with open(prettified_out_file, 'w') as f:
-             json.dump(output_data, f, indent=2)
-
-         # Save non-prettified JSON
-         logging.info("speech-to-text: Saving JSON to %s", out_file)
-         with open(out_file, 'w') as f:
-             json.dump(output_data, f)
-
-     except Exception as e:
-         logging.error("speech-to-text: Error transcribing audio: %s", str(e))
-         raise RuntimeError("speech-to-text: Error transcribing audio")
-     return segments
-
- #
- #
+ # Audio_Transcription_Lib.py
+ #########################################
+ # Transcription Library
+ # This library is used to perform transcription of audio files.
+ # Currently, uses faster_whisper for transcription.
+ #
+ ####
+ import configparser
+ ####################
+ # Function List
+ #
+ # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
+ # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False)
+ #
+ ####################
+ #
+ # Import necessary libraries to run solo for testing
+ import gc
+ import json
+ import logging
+ import os
+ import sys
+ import subprocess
+ import time
+
+ # DEBUG Imports
+ #from memory_profiler import profile
+
+ # Import Local
+ #
+ #######################################################################################################################
+ # Function Definitions
+ #
+
+ # Convert video .m4a into .wav using ffmpeg
+ # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
+ # https://www.gyan.dev/ffmpeg/builds/
+ #
+
+
+ whisper_model_instance = None
+ # Retrieve processing choice from the configuration file
+ config = configparser.ConfigParser()
+ config.read('config.txt')
+ processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
+
+
+ # FIXME: This is a temporary solution.
+ # This doesn't clear older models, which means potentially a lot of memory is being used...
+ def get_whisper_model(model_name, device):
+     global whisper_model_instance
+     if whisper_model_instance is None:
+         from faster_whisper import WhisperModel
+         logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
+         whisper_model_instance = WhisperModel(model_name, device=device)
+     return whisper_model_instance
+
+
+ # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
+ #DEBUG
+ #@profile
+ def convert_to_wav(video_file_path, offset=0, overwrite=False):
+     out_path = os.path.splitext(video_file_path)[0] + ".wav"
+
+     if os.path.exists(out_path) and not overwrite:
+         print(f"File '{out_path}' already exists. Skipping conversion.")
+         logging.info(f"Skipping conversion as file already exists: {out_path}")
+         return out_path
+     print("Starting conversion process of .m4a to .WAV")
+     out_path = os.path.splitext(video_file_path)[0] + ".wav"
+
+     try:
+         if os.name == "nt":
+             logging.debug("ffmpeg being ran on windows")
+
+             if sys.platform.startswith('win'):
+                 ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
+                 logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
+             else:
+                 ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems
+
+             command = [
+                 ffmpeg_cmd, # Assuming the working directory is correctly set where .\Bin exists
+                 "-ss", "00:00:00", # Start at the beginning of the video
+                 "-i", video_file_path,
+                 "-ar", "16000", # Audio sample rate
+                 "-ac", "1", # Number of audio channels
+                 "-c:a", "pcm_s16le", # Audio codec
+                 out_path
+             ]
+             try:
+                 # Redirect stdin from null device to prevent ffmpeg from waiting for input
+                 with open(os.devnull, 'rb') as null_file:
+                     result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
+                 if result.returncode == 0:
+                     logging.info("FFmpeg executed successfully")
+                     logging.debug("FFmpeg output: %s", result.stdout)
+                 else:
+                     logging.error("Error in running FFmpeg")
+                     logging.error("FFmpeg stderr: %s", result.stderr)
+                     raise RuntimeError(f"FFmpeg error: {result.stderr}")
+             except Exception as e:
+                 logging.error("Error occurred - ffmpeg doesn't like windows")
+                 raise RuntimeError("ffmpeg failed")
+         elif os.name == "posix":
+             os.system(f'ffmpeg -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
+         else:
+             raise RuntimeError("Unsupported operating system")
+         logging.info("Conversion to WAV completed: %s", out_path)
+     except subprocess.CalledProcessError as e:
+         logging.error("Error executing FFmpeg command: %s", str(e))
+         raise RuntimeError("Error converting video file to WAV")
+     except Exception as e:
+         logging.error("speech-to-text: Error transcribing audio: %s", str(e))
+         return {"error": str(e)}
+     gc.collect()
+     return out_path
+
+
+ # Transcribe .wav into .segments.json
+ #DEBUG
+ #@profile
+ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
+     global whisper_model_instance, processing_choice
+     logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)
+
+     time_start = time.time()
+     if audio_file_path is None:
+         raise ValueError("speech-to-text: No audio file provided")
+     logging.info("speech-to-text: Audio file path: %s", audio_file_path)
+
+     try:
+         _, file_ending = os.path.splitext(audio_file_path)
+         out_file = audio_file_path.replace(file_ending, ".segments.json")
+         prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")
+         if os.path.exists(out_file):
+             logging.info("speech-to-text: Segments file already exists: %s", out_file)
+             with open(out_file) as f:
+                 global segments
+                 segments = json.load(f)
+             return segments
+
+         logging.info('speech-to-text: Starting transcription...')
+         options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
+         transcribe_options = dict(task="transcribe", **options)
+         # use function and config at top of file
+         whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
+         segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)
+
+         segments = []
+         for segment_chunk in segments_raw:
+             chunk = {
+                 "Time_Start": segment_chunk.start,
+                 "Time_End": segment_chunk.end,
+                 "Text": segment_chunk.text
+             }
+             logging.debug("Segment: %s", chunk)
+             segments.append(chunk)
+
+         if segments:
+             segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
+
+         if not segments:
+             raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")
+         logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)
+
+         # Save the segments to a JSON file - prettified and non-prettified
+         # FIXME so this is an optional flag to save either the prettified json file or the normal one
+         save_json = True
+         if save_json:
+             logging.info("speech-to-text: Saving segments to JSON file")
+             output_data = {'segments': segments}
+
+             logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
+             with open(prettified_out_file, 'w') as f:
+                 json.dump(output_data, f, indent=2)
+
+             logging.info("speech-to-text: Saving JSON to %s", out_file)
+             with open(out_file, 'w') as f:
+                 json.dump(output_data, f)
+
+         logging.debug(f"speech-to-text: returning {segments[:500]}")
+         gc.collect()
+         return segments
+
+     except Exception as e:
+         logging.error("speech-to-text: Error transcribing audio: %s", str(e))
+         raise RuntimeError("speech-to-text: Error transcribing audio")
+
+ #
+ #
  #######################################################################################################################
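
The public entry points keep the signatures listed in the "Function List" header, so callers are unaffected by the new module-level model cache. A minimal usage sketch follows; the file name "interview.m4a" and the driver script itself are made up for illustration, and it assumes a config.txt with a [Processing] section plus an ffmpeg binary on PATH (or in .\Bin on Windows), as the comments above describe:

    # Hypothetical driver script, not part of this commit
    from App_Function_Libraries.Audio_Transcription_Lib import convert_to_wav, speech_to_text

    # Writes interview.wav next to the input and returns its path
    wav_path = convert_to_wav("interview.m4a")

    # Returns a list of {"Time_Start", "Time_End", "Text"} dicts and
    # saves .segments.json / .segments_pretty.json beside the audio file
    segments = speech_to_text(wav_path,
                              selected_source_lang='en',
                              whisper_model='medium.en',
                              vad_filter=True)
    for seg in segments:
        print(seg["Time_Start"], seg["Time_End"], seg["Text"])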
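
The FIXME on get_whisper_model notes that the single whisper_model_instance global is never replaced: once loaded, a later call with a different model name or device silently returns the first model. One possible follow-up, shown only as a sketch and not as what this commit does, is to cache by (model_name, device) and drop the previous model before loading a new one; the _cached_key and _cached_model names are hypothetical replacements for the current global:

    # Sketch: keep exactly one WhisperModel resident, keyed by (model_name, device)
    _cached_key = None
    _cached_model = None

    def get_whisper_model(model_name, device):
        global _cached_key, _cached_model
        from faster_whisper import WhisperModel
        key = (model_name, device)
        if key != _cached_key:
            # Release the previously loaded model (if any) before creating the new one
            _cached_model = None
            gc.collect()
            logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
            _cached_model = WhisperModel(model_name, device=device)
            _cached_key = key
        return _cached_model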