Support parallel execution of Silero VAD
Files changed:
- app.py (+28 -13)
- cli.py (+3 -1)
- src/modelCache.py (+17 -0)
- src/vad.py (+60 -28)
- src/vadParallel.py (+93 -23)
- src/whisperContainer.py (+12 -26)
- tests/vad_test.py (+2 -2)
app.py
CHANGED
@@ -6,10 +6,9 @@ from io import StringIO
 import os
 import pathlib
 import tempfile
+from src.modelCache import ModelCache
 from src.vadParallel import ParallelContext, ParallelTranscription

-from src.whisperContainer import WhisperContainer, WhisperModelCache
-
 # External programs
 import ffmpeg

@@ -19,6 +18,7 @@ import gradio as gr
 from src.download import ExceededMaximumDuration, download_url
 from src.utils import slugify, write_srt, write_vtt
 from src.vad import AbstractTranscription, NonSpeechStrategy, PeriodicTranscriptionConfig, TranscriptionConfig, VadPeriodicTranscription, VadSileroTranscription
+from src.whisperContainer import WhisperContainer

 # Limitations (set to -1 to disable)
 DEFAULT_INPUT_AUDIO_MAX_DURATION = 600 # seconds
@@ -50,11 +50,13 @@ LANGUAGES = [
 ]

 class WhisperTranscriber:
-    def __init__(self, input_audio_max_duration: float = DEFAULT_INPUT_AUDIO_MAX_DURATION, vad_process_timeout: float = None, delete_uploaded_files: bool = DELETE_UPLOADED_FILES):
+    def __init__(self, input_audio_max_duration: float = DEFAULT_INPUT_AUDIO_MAX_DURATION, vad_process_timeout: float = None, vad_cpu_cores: int = 1, delete_uploaded_files: bool = DELETE_UPLOADED_FILES):
+        self.model_cache = ModelCache()
         self.parallel_device_list = None
+        self.gpu_parallel_context = None
+        self.cpu_parallel_context = None
         self.vad_process_timeout = vad_process_timeout
+        self.vad_cpu_cores = vad_cpu_cores

         self.vad_model = None
         self.inputAudioMaxDuration = input_audio_max_duration
@@ -142,17 +144,27 @@ class WhisperTranscriber:
             # No parallel devices, so just run the VAD and Whisper in sequence
             return vadModel.transcribe(audio_path, whisperCallable, vadConfig)

+        gpu_devices = self.parallel_device_list
+
+        if (gpu_devices is None or len(gpu_devices) == 0):
+            # No GPU devices specified, pass the current environment variable to the first GPU process. This may be NULL.
+            gpu_devices = [os.environ.get("CUDA_VISIBLE_DEVICES", None)]
+
         # Create parallel context if needed
+        if (self.gpu_parallel_context is None):
             # Create a context wih processes and automatically clear the pool after 1 hour of inactivity
+            self.gpu_parallel_context = ParallelContext(num_processes=len(gpu_devices), auto_cleanup_timeout_seconds=self.vad_process_timeout)
+        # We also need a CPU context for the VAD
+        if (self.cpu_parallel_context is None):
+            self.cpu_parallel_context = ParallelContext(num_processes=self.vad_cpu_cores, auto_cleanup_timeout_seconds=self.vad_process_timeout)

         parallel_vad = ParallelTranscription()
         return parallel_vad.transcribe_parallel(transcription=vadModel, audio=audio_path, whisperCallable=whisperCallable,
+                                                config=vadConfig, cpu_device_count=self.vad_cpu_cores, gpu_devices=gpu_devices,
+                                                cpu_parallel_context=self.cpu_parallel_context, gpu_parallel_context=self.gpu_parallel_context)

     def _has_parallel_devices(self):
-        return self.parallel_device_list is not None and len(self.parallel_device_list) > 0
+        return (self.parallel_device_list is not None and len(self.parallel_device_list) > 0) or self.vad_cpu_cores > 1

     def _concat_prompt(self, prompt1, prompt2):
         if (prompt1 is None):
@@ -249,13 +261,15 @@ class WhisperTranscriber:
     def close(self):
         self.clear_cache()

+        if (self.gpu_parallel_context is not None):
+            self.gpu_parallel_context.close()
+        if (self.cpu_parallel_context is not None):
+            self.cpu_parallel_context.close()


 def create_ui(input_audio_max_duration, share=False, server_name: str = None, server_port: int = 7860,
-              default_model_name: str = "medium", default_vad: str = None, vad_parallel_devices: str = None, vad_process_timeout: float = None):
-    ui = WhisperTranscriber(input_audio_max_duration, vad_process_timeout)
+              default_model_name: str = "medium", default_vad: str = None, vad_parallel_devices: str = None, vad_process_timeout: float = None, vad_cpu_cores: int = 1):
+    ui = WhisperTranscriber(input_audio_max_duration, vad_process_timeout, vad_cpu_cores)

     # Specify a list of devices to use for parallel processing
     ui.set_parallel_devices(vad_parallel_devices)
@@ -303,6 +317,7 @@ if __name__ == '__main__':
     parser.add_argument("--default_model_name", type=str, default="medium", help="The default model name.")
     parser.add_argument("--default_vad", type=str, default="silero-vad", help="The default VAD.")
     parser.add_argument("--vad_parallel_devices", type=str, default="", help="A commma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.")
+    parser.add_argument("--vad_cpu_cores", type=int, default=1, help="The number of CPU cores to use for VAD pre-processing.")
     parser.add_argument("--vad_process_timeout", type=float, default="1800", help="The number of seconds before inactivate processes are terminated. Use 0 to close processes immediately, or None for no timeout.")

     args = parser.parse_args().__dict__
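Taken together, the app.py changes thread the new vad_cpu_cores option through WhisperTranscriber and create_ui, next to the existing vad_parallel_devices and vad_process_timeout options. A rough launch example using only flags added or shown in this diff (the values are illustrative, not defaults):

    # Web UI: Whisper runs on CUDA devices 0 and 1, Silero VAD pre-processing is split
    # across 4 CPU cores, and idle worker processes are closed after 30 minutes.
    python app.py --vad_parallel_devices "0,1" --vad_cpu_cores 4 --vad_process_timeout 1800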
cli.py
CHANGED
@@ -32,6 +32,7 @@ def cli():
     parser.add_argument("--vad_max_merge_size", type=optional_float, default=30, help="The maximum size (in seconds) of a voice segment")
     parser.add_argument("--vad_padding", type=optional_float, default=1, help="The padding (in seconds) to add to each voice segment")
     parser.add_argument("--vad_prompt_window", type=optional_float, default=3, help="The window size of the prompt to pass to Whisper")
+    parser.add_argument("--vad_cpu_cores", type=int, default=1, help="The number of CPU cores to use for VAD pre-processing.")
     parser.add_argument("--vad_parallel_devices", type=str, default="", help="A commma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.")

     parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
@@ -73,8 +74,9 @@ def cli():
     vad_max_merge_size = args.pop("vad_max_merge_size")
     vad_padding = args.pop("vad_padding")
     vad_prompt_window = args.pop("vad_prompt_window")
+    vad_cpu_cores = args.pop("vad_cpu_cores")

-    model = WhisperContainer(model_name, device=device, download_root=model_dir)
+    model = WhisperContainer(model_name, device=device, download_root=model_dir, vad_cpu_cores=vad_cpu_cores)
     transcriber = WhisperTranscriber(delete_uploaded_files=False)
     transcriber.set_parallel_devices(args.pop("vad_parallel_devices"))

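cli.py gains the same switch. A hedged example invocation: the audio argument and the --vad flag are not part of this diff and are assumed from the existing CLI, so treat them as illustrative only:

    # CLI: pre-process with Silero VAD on 4 CPU cores, then transcribe on CUDA devices 0 and 1.
    python cli.py --vad silero-vad --vad_cpu_cores 4 --vad_parallel_devices "0,1" audio.mp3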
src/modelCache.py
ADDED
@@ -0,0 +1,17 @@
+class ModelCache:
+    def __init__(self):
+        self._cache = dict()
+
+    def get(self, model_key: str, model_factory):
+        result = self._cache.get(model_key)
+
+        if result is None:
+            result = model_factory()
+            self._cache[model_key] = result
+        return result
+
+    def clear(self):
+        self._cache.clear()
+
+# A global cache of models. This is mainly used by the daemon processes to avoid loading the same model multiple times.
+GLOBAL_MODEL_CACHE = ModelCache()
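The new ModelCache maps a string key to a lazily created model: get() only calls the factory when the key is missing, and GLOBAL_MODEL_CACHE gives each daemon process a per-process singleton. A minimal sketch of the pattern the other files rely on (expensive_load is a made-up stand-in for whisper.load_model or torch.hub.load):

    from src.modelCache import GLOBAL_MODEL_CACHE

    def expensive_load():
        # Placeholder for a slow model load
        return object()

    # The first call runs the factory and stores the result under the key;
    # later calls with the same key return the cached object.
    a = GLOBAL_MODEL_CACHE.get("WhisperContainer.medium:cuda", expensive_load)
    b = GLOBAL_MODEL_CACHE.get("WhisperContainer.medium:cuda", expensive_load)
    assert a is b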
src/vad.py
CHANGED
@@ -1,9 +1,11 @@
 from abc import ABC, abstractmethod
 from collections import Counter, deque
+import time

 from typing import Any, Deque, Iterator, List, Dict

 from pprint import pprint
+from src.modelCache import GLOBAL_MODEL_CACHE, ModelCache

 from src.segments import merge_timestamps
 from src.whisperContainer import WhisperCallback
@@ -76,7 +78,7 @@ class AbstractTranscription(ABC):
         return load_audio(str, self.sampling_rate, start_time, duration)

     @abstractmethod
-    def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig):
+    def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig, start_time: float, end_time: float):
         """
         Get the start and end timestamps of the sections that should be transcribed by this VAD method.

@@ -93,10 +95,10 @@ class AbstractTranscription(ABC):
         """
         return

+    def get_merged_timestamps(self, timestamps: List[Dict[str, Any]], config: TranscriptionConfig, total_duration: float):
         """
         Get the start and end timestamps of the sections that should be transcribed by this VAD method,
-        after merging the segments using the specified configuration.
+        after merging the given segments using the specified configuration.

         Parameters
         ----------
@@ -109,21 +111,17 @@ class AbstractTranscription(ABC):
         -------
         A list of start and end timestamps, in fractional seconds.
         """
-        merged = merge_timestamps(seconds_timestamps, config.max_silent_period, config.max_merge_size,
+        merged = merge_timestamps(timestamps, config.max_silent_period, config.max_merge_size,
                                   config.segment_padding_left, config.segment_padding_right)

         if config.non_speech_strategy != NonSpeechStrategy.SKIP:
-            max_audio_duration = get_audio_duration(audio)
-
             # Expand segments to include the gaps between them
             if (config.non_speech_strategy == NonSpeechStrategy.CREATE_SEGMENT):
                 # When we have a prompt window, we create speech segments betwen each segment if we exceed the merge size
+                merged = self.fill_gaps(merged, total_duration=total_duration, max_expand_size=config.max_merge_size)
             elif config.non_speech_strategy == NonSpeechStrategy.EXPAND_SEGMENT:
                 # With no prompt window, it is better to just expand the segments (this effectively passes the prompt to the next segment)
+                merged = self.expand_gaps(merged, total_duration=total_duration)
             else:
                 raise Exception("Unknown non-speech strategy: " + str(config.non_speech_strategy))

@@ -147,8 +145,11 @@ class AbstractTranscription(ABC):
         A list of start and end timestamps, in fractional seconds.
         """

+        max_audio_duration = get_audio_duration(audio)
+        timestamp_segments = self.get_transcribe_timestamps(audio, config, 0, max_audio_duration)
+
         # Get speech timestamps from full audio file
+        merged = self.get_merged_timestamps(timestamp_segments, config, max_audio_duration)

         # A deque of transcribed segments that is passed to the next segment as a prompt
         prompt_window = deque()
@@ -392,22 +393,41 @@ class AbstractTranscription(ABC):


 class VadSileroTranscription(AbstractTranscription):
-    def __init__(self, sampling_rate: int = 16000):
+    def __init__(self, sampling_rate: int = 16000, cache: ModelCache = None):
         super().__init__(sampling_rate=sampling_rate)
+        self.model = None
+        self.cache = cache
+        self._initialize_model()
+
+    def _initialize_model(self):
+        if (self.cache is not None):
+            model_key = "VadSileroTranscription"
+            self.model, self.get_speech_timestamps = self.cache.get(model_key, self._create_model)
+            print("Loaded Silerio model from cache.")
+        else:
+            self.model, self.get_speech_timestamps = self._create_model()
+            print("Created Silerio model")
+
+    def _create_model(self):
+        model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad')
+
+        # Silero does not benefit from multi-threading
+        torch.set_num_threads(1) # JIT
+        (get_speech_timestamps, _, _, _, _) = utils
+
+        return model, get_speech_timestamps
+
+    def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig, start_time: float, end_time: float):
         result = []

+        print("Getting timestamps from audio file: {}, start: {}, duration: {}".format(audio, start_time, end_time))
+        perf_start_time = time.perf_counter()
+
         # Divide procesisng of audio into chunks
+        chunk_start = start_time

+        while (chunk_start < end_time):
+            chunk_duration = min(end_time - chunk_start, VAD_MAX_PROCESSING_CHUNK)

             print("Processing VAD in chunk from {} to {}".format(format_timestamp(chunk_start), format_timestamp(chunk_start + chunk_duration)))
             wav = self.get_audio_segment(audio, str(chunk_start), str(chunk_duration))
@@ -421,23 +441,35 @@ class VadSileroTranscription(AbstractTranscription):
             result.extend(adjusted)
             chunk_start += chunk_duration

+        perf_end_time = time.perf_counter()
+        print("VAD processing took {} seconds".format(perf_end_time - perf_start_time))
+
         return result

+    def __getstate__(self):
+        # We only need the sampling rate
+        return { 'sampling_rate': self.sampling_rate }
+
+    def __setstate__(self, state):
+        self.sampling_rate = state['sampling_rate']
+        self.model = None
+        # Use the global cache
+        self.cache = GLOBAL_MODEL_CACHE
+        self._initialize_model()
+
 # A very simple VAD that just marks every N seconds as speech
 class VadPeriodicTranscription(AbstractTranscription):
     def __init__(self, sampling_rate: int = 16000):
         super().__init__(sampling_rate=sampling_rate)

-    def get_transcribe_timestamps(self, audio: str, config: PeriodicTranscriptionConfig):
-        # Get duration in seconds
-        audio_duration = get_audio_duration(audio)
+    def get_transcribe_timestamps(self, audio: str, config: PeriodicTranscriptionConfig, start_time: float, end_time: float):
         result = []

         # Generate a timestamp every N seconds
+        start_timestamp = start_time

+        while (start_timestamp < end_time):
+            end_timestamp = min(start_timestamp + config.periodic_duration, end_time)
             segment_duration = end_timestamp - start_timestamp

             # Minimum duration is 1 second
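The new __getstate__/__setstate__ pair is what lets VadSileroTranscription be shipped to worker processes: only the sampling rate is pickled, and the unpickled copy re-initializes its Silero model through GLOBAL_MODEL_CACHE, so each worker loads the model at most once. A small sketch of that round trip (it assumes torch and network access for torch.hub, since unpickling triggers a model load):

    import pickle
    from src.vad import VadSileroTranscription

    vad = VadSileroTranscription()     # loads the Silero model in this process
    payload = pickle.dumps(vad)        # serializes only {'sampling_rate': 16000}
    clone = pickle.loads(payload)      # __setstate__ reloads the model via GLOBAL_MODEL_CACHE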
src/vadParallel.py
CHANGED
@@ -1,12 +1,12 @@
 import multiprocessing
 import threading
 import time
-from src.vad import AbstractTranscription, TranscriptionConfig
+from src.vad import AbstractTranscription, TranscriptionConfig, get_audio_duration
 from src.whisperContainer import WhisperCallback

 from multiprocessing import Pool

-from typing import List
+from typing import Any, Dict, List
 import os


@@ -76,19 +76,28 @@ class ParallelTranscriptionConfig(TranscriptionConfig):
         super().__init__(copy.non_speech_strategy, copy.segment_padding_left, copy.segment_padding_right, copy.max_silent_period, copy.max_merge_size, copy.max_prompt_window, initial_segment_index)
         self.device_id = device_id
         self.override_timestamps = override_timestamps

 class ParallelTranscription(AbstractTranscription):
+    # Silero VAD typically takes about 3 seconds per minute, so there's no need to split the chunks
+    # into smaller segments than 2 minute (min 6 seconds per CPU core)
+    MIN_CPU_CHUNK_SIZE_SECONDS = 2 * 60
+
     def __init__(self, sampling_rate: int = 16000):
         super().__init__(sampling_rate=sampling_rate)

+    def transcribe_parallel(self, transcription: AbstractTranscription, audio: str, whisperCallable: WhisperCallback, config: TranscriptionConfig,
+                            cpu_device_count: int, gpu_devices: List[str], cpu_parallel_context: ParallelContext = None, gpu_parallel_context: ParallelContext = None):
+        total_duration = get_audio_duration(audio)
+
         # First, get the timestamps for the original audio
+        if (cpu_device_count > 1):
+            merged = self._get_merged_timestamps_parallel(transcription, audio, config, total_duration, cpu_device_count, cpu_parallel_context)
+        else:
+            merged = transcription.get_merged_timestamps(audio, config, total_duration)

         # Split into a list for each device
         # TODO: Split by time instead of by number of chunks
+        merged_split = list(self._split(merged, len(gpu_devices)))

         # Parameters that will be passed to the transcribe function
         parameters = []
@@ -96,15 +105,15 @@ class ParallelTranscription(AbstractTranscription):

         for i in range(len(merged_split)):
             device_segment_list = list(merged_split[i])
+            device_id = gpu_devices[i]

             if (len(device_segment_list) <= 0):
                 continue

-            print("Device " + device_id + " (index " + str(i) + ") has " + str(len(device_segment_list)) + " segments")
+            print("Device " + str(device_id) + " (index " + str(i) + ") has " + str(len(device_segment_list)) + " segments")

             # Create a new config with the given device ID
+            device_config = ParallelTranscriptionConfig(device_id, device_segment_list, segment_index, config)
             segment_index += len(device_segment_list)

             parameters.append([audio, whisperCallable, device_config]);
@@ -119,12 +128,12 @@ class ParallelTranscription(AbstractTranscription):

         # Spawn a separate process for each device
         try:
+            if (gpu_parallel_context is None):
+                gpu_parallel_context = ParallelContext(len(gpu_devices))
                 created_context = True

             # Get a pool of processes
+            pool = gpu_parallel_context.get_pool()

             # Run the transcription in parallel
             results = pool.starmap(self.transcribe, parameters)
@@ -140,29 +149,90 @@ class ParallelTranscription(AbstractTranscription):

         finally:
             # Return the pool to the context
+            if (gpu_parallel_context is not None):
+                gpu_parallel_context.return_pool(pool)
             # Always close the context if we created it
             if (created_context):
+                gpu_parallel_context.close()

         return merged

+    def _get_merged_timestamps_parallel(self, transcription: AbstractTranscription, audio: str, config: TranscriptionConfig, total_duration: float,
+                                        cpu_device_count: int, cpu_parallel_context: ParallelContext = None):
+        parameters = []
+
+        chunk_size = max(total_duration / cpu_device_count, self.MIN_CPU_CHUNK_SIZE_SECONDS)
+        chunk_start = 0
+        cpu_device_id = 0
+
+        perf_start_time = time.perf_counter()
+
+        # Create chunks that will be processed on the CPU
+        while (chunk_start < total_duration):
+            chunk_end = min(chunk_start + chunk_size, total_duration)
+
+            print("Parallel VAD: Executing chunk from " + str(chunk_start) + " to " +
+                  str(chunk_end) + " on CPU device " + str(cpu_device_id))
+            parameters.append([audio, config, chunk_start, chunk_end]);
+
+            cpu_device_id += 1
+            chunk_start = chunk_end
+
+        created_context = False
+
+        # Spawn a separate process for each device
+        try:
+            if (cpu_parallel_context is None):
+                cpu_parallel_context = ParallelContext(cpu_device_count)
+                created_context = True
+
+            # Get a pool of processes
+            pool = cpu_parallel_context.get_pool()
+
+            # Run the transcription in parallel. Note that transcription must be picklable.
+            results = pool.starmap(transcription.get_transcribe_timestamps, parameters)
+
+            timestamps = []
+
+            # Flatten the results
+            for result in results:
+                timestamps.extend(result)
+
+            merged = transcription.get_merged_timestamps(timestamps, config, total_duration)
+
+            perf_end_time = time.perf_counter()
+            print("Parallel VAD processing took {} seconds".format(perf_end_time - perf_start_time))
+            return merged
+
+        finally:
+            # Return the pool to the context
+            if (cpu_parallel_context is not None):
+                cpu_parallel_context.return_pool(pool)
+            # Always close the context if we created it
+            if (created_context):
+                cpu_parallel_context.close()
+
+    def get_transcribe_timestamps(self, audio: str, config: ParallelTranscriptionConfig, start_time: float, duration: float):
         return []

+    def get_merged_timestamps(self, timestamps: List[Dict[str, Any]], config: ParallelTranscriptionConfig, total_duration: float):
         # Override timestamps that will be processed
         if (config.override_timestamps is not None):
             print("Using override timestamps of size " + str(len(config.override_timestamps)))
             return config.override_timestamps
+        return super().get_merged_timestamps(timestamps, config, total_duration)

     def transcribe(self, audio: str, whisperCallable: WhisperCallback, config: ParallelTranscriptionConfig):
-        # Override device ID
+        # Override device ID the first time
+        if (os.environ.get("INITIALIZED", None) is None):
+            os.environ["INITIALIZED"] = "1"
+
+            # Note that this may be None if the user didn't specify a device. In that case, Whisper will
+            # just use the default GPU device.
+            if (config.device_id is not None):
+                print("Using device " + config.device_id)
+                os.environ["CUDA_VISIBLE_DEVICES"] = config.device_id
+
         return super().transcribe(audio, whisperCallable, config)

     def _split(self, a, n):
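The CPU-side split is controlled by MIN_CPU_CHUNK_SIZE_SECONDS: each chunk handed to the pool covers at least two minutes of audio, so short files do not spawn more VAD workers than they can keep busy. The arithmetic, with illustrative numbers:

    # 10 minutes of audio on 4 CPU cores
    chunk_size = max(600 / 4, 2 * 60)    # -> 150 seconds
    # chunks: [0, 150), [150, 300), [300, 450), [450, 600)  -> all 4 workers are used

    # 3 minutes of audio on 4 CPU cores
    chunk_size = max(180 / 4, 2 * 60)    # -> 120 seconds
    # chunks: [0, 120), [120, 180)       -> only 2 workers are used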
src/whisperContainer.py
CHANGED
@@ -1,29 +1,10 @@
 # External programs
 import whisper

-class WhisperModelCache:
-    def __init__(self):
-        self._cache = dict()
-
-    def get(self, model_name, device: str = None):
-        key = model_name + ":" + (device if device else '')
-
-        result = self._cache.get(key)
-
-        if result is None:
-            print("Loading whisper model " + model_name)
-            result = whisper.load_model(name=model_name, device=device)
-            self._cache[key] = result
-        return result
-
-    def clear(self):
-        self._cache.clear()
-
-# A global cache of models. This is mainly used by the daemon processes to avoid loading the same model multiple times.
-GLOBAL_WHISPER_MODEL_CACHE = WhisperModelCache()
+from src.modelCache import GLOBAL_MODEL_CACHE, ModelCache

 class WhisperContainer:
+    def __init__(self, model_name: str, device: str = None, download_root: str = None, cache: ModelCache = None):
         self.model_name = model_name
         self.device = device
         self.download_root = download_root
@@ -36,12 +17,16 @@ class WhisperContainer:
         if self.model is None:

             if (self.cache is None):
-                self.model = whisper.load_model(self.model_name, device=self.device, download_root=self.download_root)
+                self.model = self._create_model()
             else:
+                model_key = "WhisperContainer." + self.model_name + ":" + (self.device if self.device else '')
+                self.model = self.cache.get(model_key, self._create_model)
         return self.model

+    def _create_model(self):
+        print("Loading whisper model " + self.model_name)
+        return whisper.load_model(self.model_name, device=self.device, download_root=self.download_root)
+
     def create_callback(self, language: str = None, task: str = None, initial_prompt: str = None, **decodeOptions: dict):
         """
         Create a WhisperCallback object that can be used to transcript audio files.
@@ -65,14 +50,15 @@ class WhisperContainer:

     # This is required for multiprocessing
     def __getstate__(self):
-        return { "model_name": self.model_name, "device": self.device }
+        return { "model_name": self.model_name, "device": self.device, "download_root": self.download_root }

     def __setstate__(self, state):
         self.model_name = state["model_name"]
         self.device = state["device"]
+        self.download_root = state["download_root"]
         self.model = None
         # Depickled objects must use the global cache
+        self.cache = GLOBAL_MODEL_CACHE


 class WhisperCallback:
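WhisperContainer now defers to the shared ModelCache with a namespaced key ("WhisperContainer." + model_name + ":" + device), so Whisper models and the Silero VAD entry ("VadSileroTranscription") never collide in the same cache. A hedged sketch of the resulting behaviour; it assumes the method containing the cached load shown above is WhisperContainer.get_model(), which this diff does not name:

    from src.modelCache import ModelCache
    from src.whisperContainer import WhisperContainer

    cache = ModelCache()
    a = WhisperContainer("medium", device="cuda", cache=cache)
    b = WhisperContainer("medium", device="cuda", cache=cache)

    # Both containers resolve to the key "WhisperContainer.medium:cuda",
    # so whisper.load_model() should only run once for the two of them.
    assert a.get_model() is b.get_model()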
tests/vad_test.py
CHANGED
@@ -5,7 +5,7 @@ import sys

 sys.path.append('../whisper-webui')

-from src.vad import AbstractTranscription, VadSileroTranscription
+from src.vad import AbstractTranscription, TranscriptionConfig, VadSileroTranscription

 class TestVad(unittest.TestCase):
     def __init__(self, *args, **kwargs):
@@ -55,7 +55,7 @@ class MockVadTranscription(AbstractTranscription):
         # For mocking, this just returns a simple numppy array
         return np.array([start_time_seconds, duration_seconds], dtype=np.float64)

-    def get_transcribe_timestamps(self, audio: str):
+    def get_transcribe_timestamps(self, audio: str, config: TranscriptionConfig, start_time: float, duration: float):
         result = []

         result.append( { 'start': 30, 'end': 60 } )