Support progress for multiple devices
Files changed:
- app.py +2 -2
- src/vad.py +78 -66
- src/vadParallel.py +50 -8
app.py
CHANGED

@@ -279,7 +279,6 @@ class WhisperTranscriber:
             # No parallel devices, so just run the VAD and Whisper in sequence
             return vadModel.transcribe(audio_path, whisperCallable, vadConfig, progressListener=progressListener)
 
-        # TODO: Handle progress listener
         gpu_devices = self.parallel_device_list
 
         if (gpu_devices is None or len(gpu_devices) == 0):
@@ -297,7 +296,8 @@ class WhisperTranscriber:
         parallel_vad = ParallelTranscription()
         return parallel_vad.transcribe_parallel(transcription=vadModel, audio=audio_path, whisperCallable=whisperCallable,
                                                 config=vadConfig, cpu_device_count=self.vad_cpu_cores, gpu_devices=gpu_devices,
-                                                cpu_parallel_context=self.cpu_parallel_context, gpu_parallel_context=self.gpu_parallel_context)
+                                                cpu_parallel_context=self.cpu_parallel_context, gpu_parallel_context=self.gpu_parallel_context,
+                                                progress_listener=progressListener)
 
     def _has_parallel_devices(self):
         return (self.parallel_device_list is not None and len(self.parallel_device_list) > 0) or self.vad_cpu_cores > 1
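For reference, the progressListener forwarded in both branches above follows the two-method callback interface that appears in the src/vadParallel.py diff below: on_progress(current, total) and on_finished(). A minimal caller-side sketch, assuming only that interface (the class name is hypothetical, not part of the repository):

    # Hypothetical console listener; assumes only the on_progress/on_finished
    # interface visible in this change. Values are seconds of processed audio.
    class ConsoleProgressListener:
        def on_progress(self, current: float, total: float):
            percent = 100.0 * current / total if total else 0.0
            print(f"Transcribed {current:.1f}/{total:.1f} s ({percent:.0f}%)")

        def on_finished(self):
            print("Transcription finished.")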
src/vad.py
CHANGED

@@ -153,84 +153,96 @@ class AbstractTranscription(ABC):
         A list of start and end timestamps, in fractional seconds.
         """
 
-        max_audio_duration = get_audio_duration(audio)
-        timestamp_segments = self.get_transcribe_timestamps(audio, config, 0, max_audio_duration)
-
-        # Get speech timestamps from full audio file
-        merged = self.get_merged_timestamps(timestamp_segments, config, max_audio_duration)
-
-        # A deque of transcribed segments that is passed to the next segment as a prompt
-        prompt_window = deque()
-
-        print("Processing timestamps:")
-        pprint(merged)
-
-        result = {
-            'text': "",
-            'segments': [],
-            'language': ""
-        }
-        languageCounter = Counter()
-        detected_language = None
-
-        segment_index = config.initial_segment_index
-
-        # For each time segment, run whisper
-        for segment in merged:
-            segment_index += 1
-            segment_start = segment['start']
-            segment_end = segment['end']
-            segment_expand_amount = segment.get('expand_amount', 0)
-            segment_gap = segment.get('gap', False)
-
-            segment_duration = segment_end - segment_start
-
-            if segment_duration < MIN_SEGMENT_DURATION:
-                continue
-
-            # Audio to run on Whisper
-            segment_audio = self.get_audio_segment(audio, start_time = str(segment_start), duration = str(segment_duration))
-            # Previous segments to use as a prompt
-            segment_prompt = ' '.join([segment['text'] for segment in prompt_window]) if len(prompt_window) > 0 else None
-
-            # Detected language
-            detected_language = languageCounter.most_common(1)[0][0] if len(languageCounter) > 0 else None
-
-            print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ",
-                  segment_duration, "expanded: ", segment_expand_amount, "prompt: ", segment_prompt, "language: ", detected_language)
-
-            scaled_progress_listener = SubTaskProgressListener(progressListener, base_task_total=max_audio_duration, sub_task_start=segment_start, sub_task_total=segment_duration)
-            segment_result = whisperCallable.invoke(segment_audio, segment_index, segment_prompt, detected_language, progress_listener=scaled_progress_listener)
-
-            adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
-
-            # Propagate expand amount to the segments
-            if (segment_expand_amount > 0):
-                segment_without_expansion = segment_duration - segment_expand_amount
-
-                for adjusted_segment in adjusted_segments:
-                    adjusted_segment_end = adjusted_segment['end']
-
-                    # Add expand amount if the segment got expanded
-                    if (adjusted_segment_end > segment_without_expansion):
-                        adjusted_segment["expand_amount"] = adjusted_segment_end - segment_without_expansion
-
-            # Append to output
-            result['text'] += segment_result['text']
-            result['segments'].extend(adjusted_segments)
-
-            # Increment detected language
-            if not segment_gap:
-                languageCounter[segment_result['language']] += 1
-
-            # Update prompt window
-            self.__update_prompt_window(prompt_window, adjusted_segments, segment_end, segment_gap, config)
-
-        if detected_language is not None:
-            result['language'] = detected_language
+        try:
+            max_audio_duration = self.get_audio_duration(audio, config)
+            timestamp_segments = self.get_transcribe_timestamps(audio, config, 0, max_audio_duration)
+
+            # Get speech timestamps from full audio file
+            merged = self.get_merged_timestamps(timestamp_segments, config, max_audio_duration)
+
+            # A deque of transcribed segments that is passed to the next segment as a prompt
+            prompt_window = deque()
+
+            print("Processing timestamps:")
+            pprint(merged)
+
+            result = {
+                'text': "",
+                'segments': [],
+                'language': ""
+            }
+            languageCounter = Counter()
+            detected_language = None
+
+            segment_index = config.initial_segment_index
+
+            # Calculate progress
+            progress_start_offset = merged[0]['start'] if len(merged) > 0 else 0
+            progress_total_duration = sum([segment['end'] - segment['start'] for segment in merged])
+
+            # For each time segment, run whisper
+            for segment in merged:
+                segment_index += 1
+                segment_start = segment['start']
+                segment_end = segment['end']
+                segment_expand_amount = segment.get('expand_amount', 0)
+                segment_gap = segment.get('gap', False)
+
+                segment_duration = segment_end - segment_start
+
+                if segment_duration < MIN_SEGMENT_DURATION:
+                    continue
+
+                # Audio to run on Whisper
+                segment_audio = self.get_audio_segment(audio, start_time = str(segment_start), duration = str(segment_duration))
+                # Previous segments to use as a prompt
+                segment_prompt = ' '.join([segment['text'] for segment in prompt_window]) if len(prompt_window) > 0 else None
+
+                # Detected language
+                detected_language = languageCounter.most_common(1)[0][0] if len(languageCounter) > 0 else None
+
+                print("Running whisper from ", format_timestamp(segment_start), " to ", format_timestamp(segment_end), ", duration: ",
+                      segment_duration, "expanded: ", segment_expand_amount, "prompt: ", segment_prompt, "language: ", detected_language)
+
+                scaled_progress_listener = SubTaskProgressListener(progressListener, base_task_total=progress_total_duration,
+                                                                   sub_task_start=segment_start - progress_start_offset, sub_task_total=segment_duration)
+                segment_result = whisperCallable.invoke(segment_audio, segment_index, segment_prompt, detected_language, progress_listener=scaled_progress_listener)
+
+                adjusted_segments = self.adjust_timestamp(segment_result["segments"], adjust_seconds=segment_start, max_source_time=segment_duration)
+
+                # Propagate expand amount to the segments
+                if (segment_expand_amount > 0):
+                    segment_without_expansion = segment_duration - segment_expand_amount
+
+                    for adjusted_segment in adjusted_segments:
+                        adjusted_segment_end = adjusted_segment['end']
+
+                        # Add expand amount if the segment got expanded
+                        if (adjusted_segment_end > segment_without_expansion):
+                            adjusted_segment["expand_amount"] = adjusted_segment_end - segment_without_expansion
+
+                # Append to output
+                result['text'] += segment_result['text']
+                result['segments'].extend(adjusted_segments)
+
+                # Increment detected language
+                if not segment_gap:
+                    languageCounter[segment_result['language']] += 1
+
+                # Update prompt window
+                self.__update_prompt_window(prompt_window, adjusted_segments, segment_end, segment_gap, config)
+
+            if detected_language is not None:
+                result['language'] = detected_language
+        finally:
+            # Notify progress listener that we are done
+            if progressListener is not None:
+                progressListener.on_finished()
         return result
 
+    def get_audio_duration(self, audio: str, config: TranscriptionConfig):
+        return get_audio_duration(audio)
+
     def __update_prompt_window(self, prompt_window: Deque, adjusted_segments: List, segment_end: float, segment_gap: bool, config: TranscriptionConfig):
         if (config.max_prompt_window is not None and config.max_prompt_window > 0):
             # Add segments to the current prompt window (unless it is a speech gap)
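The scaling logic above divides overall progress by segment: each Whisper invocation reports progress local to its own segment, and SubTaskProgressListener projects that into the whole task's range via base_task_total, sub_task_start and sub_task_total. A minimal sketch of such a wrapper, assuming a simple linear mapping (the constructor arguments come from the diff; the method bodies are illustrative, not the repository's implementation):

    # Illustrative sub-task scaling; only the constructor signature is taken
    # from the diff, the arithmetic is an assumed linear mapping.
    class SubTaskProgressListener:
        def __init__(self, base_listener, base_task_total, sub_task_start, sub_task_total):
            self.base_listener = base_listener
            self.base_task_total = base_task_total
            self.sub_task_start = sub_task_start
            self.sub_task_total = sub_task_total

        def on_progress(self, current, total):
            # Rescale local [0, total] progress onto this sub-task's share,
            # then offset by where the sub-task starts in the base task.
            fraction = current / total if total else 0.0
            self.base_listener.on_progress(
                self.sub_task_start + fraction * self.sub_task_total,
                self.base_task_total)

        def on_finished(self):
            # The base task is not finished yet; just report the sub-task
            # as fully progressed.
            self.base_listener.on_progress(
                self.sub_task_start + self.sub_task_total,
                self.base_task_total)

Note how the diff computes progress_start_offset and progress_total_duration from the merged VAD segments, so the reported total covers only speech that will actually be transcribed rather than the full audio duration.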
src/vadParallel.py
CHANGED

@@ -1,14 +1,33 @@
 import multiprocessing
+from queue import Empty
 import threading
 import time
+from src.hooks.whisperProgressHook import ProgressListener
 from src.vad import AbstractTranscription, TranscriptionConfig, get_audio_duration
 from src.whisperContainer import WhisperCallback
 
-from multiprocessing import Pool
+from multiprocessing import Pool, Queue
 
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union
 import os
 
+class _ProgressListenerToQueue(ProgressListener):
+    def __init__(self, progress_queue: Queue):
+        self.progress_queue = progress_queue
+        self.progress_total = 0
+        self.prev_progress = 0
+
+    def on_progress(self, current: Union[int, float], total: Union[int, float]):
+        delta = current - self.prev_progress
+        self.prev_progress = current
+        self.progress_total = total
+        self.progress_queue.put(delta)
+
+    def on_finished(self):
+        if self.progress_total > self.prev_progress:
+            delta = self.progress_total - self.prev_progress
+            self.progress_queue.put(delta)
+            self.prev_progress = self.progress_total
 
 class ParallelContext:
     def __init__(self, num_processes: int = None, auto_cleanup_timeout_seconds: float = None):
@@ -86,7 +105,8 @@ class ParallelTranscription(AbstractTranscription):
         super().__init__(sampling_rate=sampling_rate)
 
     def transcribe_parallel(self, transcription: AbstractTranscription, audio: str, whisperCallable: WhisperCallback, config: TranscriptionConfig,
-                            cpu_device_count: int, gpu_devices: List[str], cpu_parallel_context: ParallelContext = None, gpu_parallel_context: ParallelContext = None):
+                            cpu_device_count: int, gpu_devices: List[str], cpu_parallel_context: ParallelContext = None, gpu_parallel_context: ParallelContext = None,
+                            progress_listener: ProgressListener = None):
         total_duration = get_audio_duration(audio)
 
         # First, get the timestamps for the original audio
@@ -108,6 +128,9 @@ class ParallelTranscription(AbstractTranscription):
         parameters = []
         segment_index = config.initial_segment_index
 
+        processing_manager = multiprocessing.Manager()
+        progress_queue = processing_manager.Queue()
+
         for i in range(len(gpu_devices)):
             # Note that device_segment_list can be empty. But we will still create a process for it,
             # as otherwise we run the risk of assigning the same device to multiple processes.
@@ -120,7 +143,8 @@ class ParallelTranscription(AbstractTranscription):
             device_config = ParallelTranscriptionConfig(device_id, device_segment_list, segment_index, config)
             segment_index += len(device_segment_list)
 
-            parameters.append([audio, whisperCallable, device_config]);
+            progress_listener_to_queue = _ProgressListenerToQueue(progress_queue)
+            parameters.append([audio, whisperCallable, device_config, progress_listener_to_queue]);
 
         merged = {
             'text': '',
@@ -142,7 +166,24 @@ class ParallelTranscription(AbstractTranscription):
             pool = gpu_parallel_context.get_pool()
 
             # Run the transcription in parallel
-            results = pool.starmap(self.transcribe, parameters)
+            results_async = pool.starmap_async(self.transcribe, parameters)
+            total_progress = 0
+
+            while not results_async.ready():
+                try:
+                    delta = progress_queue.get(timeout=5) # Set a timeout of 5 seconds
+                except Empty:
+                    continue
+
+                total_progress += delta
+                if progress_listener is not None:
+                    progress_listener.on_progress(total_progress, total_duration)
+
+            results = results_async.get()
+
+            # Call the finished callback
+            if progress_listener is not None:
+                progress_listener.on_finished()
 
             for result in results:
                 # Merge the results
@@ -231,11 +272,12 @@ class ParallelTranscription(AbstractTranscription):
     def get_merged_timestamps(self, timestamps: List[Dict[str, Any]], config: ParallelTranscriptionConfig, total_duration: float):
         # Override timestamps that will be processed
         if (config.override_timestamps is not None):
-            print("Using override timestamps of size " + str(len(config.override_timestamps)))
+            print("(get_merged_timestamps) Using override timestamps of size " + str(len(config.override_timestamps)))
             return config.override_timestamps
         return super().get_merged_timestamps(timestamps, config, total_duration)
 
-    def transcribe(self, audio: str, whisperCallable: WhisperCallback, config: ParallelTranscriptionConfig):
+    def transcribe(self, audio: str, whisperCallable: WhisperCallback, config: ParallelTranscriptionConfig,
+                   progressListener: ProgressListener = None):
         # Override device ID the first time
         if (os.environ.get("INITIALIZED", None) is None):
             os.environ["INITIALIZED"] = "1"
@@ -246,7 +288,7 @@ class ParallelTranscription(AbstractTranscription):
             print("Using device " + config.device_id)
             os.environ["CUDA_VISIBLE_DEVICES"] = config.device_id
 
-        return super().transcribe(audio, whisperCallable, config)
+        return super().transcribe(audio, whisperCallable, config, progressListener)
 
     def _split(self, a, n):
         """Split a list into n approximately equal parts."""
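The multi-device progress mechanism in this file reduces to a generic pattern: each worker pushes progress deltas onto a Manager queue, while the parent polls the queue with a timeout until starmap_async completes, summing the deltas into a single progress figure. A self-contained sketch of just that pattern (the task names and numbers are invented for illustration):

    import multiprocessing
    from queue import Empty

    def work(task_id, steps, progress_queue):
        # Worker: report one unit of progress per completed step.
        for _ in range(steps):
            progress_queue.put(1)
        return task_id

    if __name__ == "__main__":
        manager = multiprocessing.Manager()
        progress_queue = manager.Queue()

        tasks = [(i, 5, progress_queue) for i in range(4)]
        total = sum(steps for _, steps, _ in tasks)

        with multiprocessing.Pool(processes=2) as pool:
            results_async = pool.starmap_async(work, tasks)

            done = 0
            while not results_async.ready():
                try:
                    done += progress_queue.get(timeout=1)
                except Empty:
                    continue
                print(f"progress: {done}/{total}")

            results = results_async.get()
        print("finished:", results)

A Manager queue is used rather than a plain multiprocessing.Queue because the manager proxy can be pickled into pool workers. The get(timeout=...) loop keeps the parent responsive even when no worker reports for a while, and on_finished in _ProgressListenerToQueue flushes any remaining delta so the per-worker totals add up at the end.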