Spaces:

aadnk
/

whisper-webui

Running

App Files Files Community

aadnk committed on Apr 27, 2023

Commit

f55c594

•

1 Parent(s): 764bdf1

Adding support for word timestamps

Browse files

Files changed (7) hide show

app.py +28 -12
cli.py +14 -2
config.json5 +10 -1
src/config.py +11 -1
src/utils.py +117 -8
src/vad.py +8 -0
src/whisper/whisperContainer.py +3 -2

app.py CHANGED Viewed

@@ -100,13 +100,17 @@ class WhisperTranscriber:
                                 vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
                                 initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
                                 condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
-                                compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float):
         return self.transcribe_webui_full_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
                                 vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
                                 initial_prompt, temperature, best_of, beam_size, patience, length_penalty, suppress_tokens,
                                 condition_on_previous_text, fp16, temperature_increment_on_fallback,
-                                compression_ratio_threshold, logprob_threshold, no_speech_threshold)
     # Entry function for the full tab with progress
     def transcribe_webui_full_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
@@ -114,6 +118,9 @@ class WhisperTranscriber:
                                     initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
                                     condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
                                     compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float,
                                     progress=gr.Progress()):
         # Handle temperature_increment_on_fallback
@@ -128,13 +135,15 @@ class WhisperTranscriber:
                                      initial_prompt=initial_prompt, temperature=temperature, best_of=best_of, beam_size=beam_size, patience=patience, length_penalty=length_penalty, suppress_tokens=suppress_tokens,
                                      condition_on_previous_text=condition_on_previous_text, fp16=fp16,
                                      compression_ratio_threshold=compression_ratio_threshold, logprob_threshold=logprob_threshold, no_speech_threshold=no_speech_threshold,
                                      progress=progress)
     def transcribe_webui(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
-                         vadOptions: VadOptions, progress: gr.Progress = None, **decodeOptions: dict):
         try:
             sources = self.__get_source(urlData, multipleFiles, microphoneData)
             try:
                 selectedLanguage = languageName.lower() if len(languageName) > 0 else None
                 selectedModel = modelName if modelName is not None else "base"
@@ -185,7 +194,7 @@ class WhisperTranscriber:
                     # Update progress
                     current_progress += source_audio_duration
-                    source_download, source_text, source_vtt = self.write_result(result, filePrefix, outputDirectory)
                     if len(sources) > 1:
                         # Add new line separators
@@ -359,7 +368,7 @@ class WhisperTranscriber:
         return config
-    def write_result(self, result: dict, source_name: str, output_dir: str):
         if not os.path.exists(output_dir):
             os.makedirs(output_dir)
@@ -368,8 +377,8 @@ class WhisperTranscriber:
         languageMaxLineWidth = self.__get_max_line_width(language)
         print("Max line width " + str(languageMaxLineWidth))
-        vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth)
-        srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth)
         output_files = []
         output_files.append(self.__create_file(srt, output_dir, source_name + "-subs.srt"));
@@ -394,13 +403,13 @@ class WhisperTranscriber:
             # 80 latin characters should fit on a 1080p/720p screen
             return 80
-    def __get_subs(self, segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
         segmentStream = StringIO()
         if format == 'vtt':
-            write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
         elif format == 'srt':
-            write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
         else:
             raise Exception("Unknown format " + format)
@@ -501,7 +510,14 @@ def create_ui(app_config: ApplicationConfig):
         gr.Number(label="Temperature increment on fallback", value=app_config.temperature_increment_on_fallback),
         gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold),
         gr.Number(label="Logprob threshold", value=app_config.logprob_threshold),
-        gr.Number(label="No speech threshold", value=app_config.no_speech_threshold)
     ], outputs=[
         gr.File(label="Download"),
         gr.Text(label="Transcription"),

                                 vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
                                 initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
                                 condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
+                                compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float,
+                                # Word timestamps
+                                word_timestamps: bool, prepend_punctuations: str,
+                                append_punctuations: str, highlight_words: bool = False):
         return self.transcribe_webui_full_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
                                 vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
                                 initial_prompt, temperature, best_of, beam_size, patience, length_penalty, suppress_tokens,
                                 condition_on_previous_text, fp16, temperature_increment_on_fallback,
+                                compression_ratio_threshold, logprob_threshold, no_speech_threshold,
+                                word_timestamps, prepend_punctuations, append_punctuations, highlight_words)
     # Entry function for the full tab with progress
     def transcribe_webui_full_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
                                     initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
                                     condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
                                     compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float,
+                                    # Word timestamps
+                                    word_timestamps: bool, prepend_punctuations: str,
+                                    append_punctuations: str, highlight_words: bool = False,
                                     progress=gr.Progress()):
         # Handle temperature_increment_on_fallback
                                      initial_prompt=initial_prompt, temperature=temperature, best_of=best_of, beam_size=beam_size, patience=patience, length_penalty=length_penalty, suppress_tokens=suppress_tokens,
                                      condition_on_previous_text=condition_on_previous_text, fp16=fp16,
                                      compression_ratio_threshold=compression_ratio_threshold, logprob_threshold=logprob_threshold, no_speech_threshold=no_speech_threshold,
+                                     word_timestamps=word_timestamps, prepend_punctuations=prepend_punctuations, append_punctuations=append_punctuations, highlight_words=highlight_words,
                                      progress=progress)
     def transcribe_webui(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
+                         vadOptions: VadOptions, progress: gr.Progress = None, highlight_words: bool = False,
+                         **decodeOptions: dict):
         try:
             sources = self.__get_source(urlData, multipleFiles, microphoneData)
             try:
                 selectedLanguage = languageName.lower() if len(languageName) > 0 else None
                 selectedModel = modelName if modelName is not None else "base"
                     # Update progress
                     current_progress += source_audio_duration
+                    source_download, source_text, source_vtt = self.write_result(result, filePrefix, outputDirectory, highlight_words)
                     if len(sources) > 1:
                         # Add new line separators
         return config
+    def write_result(self, result: dict, source_name: str, output_dir: str, highlight_words: bool = False):
         if not os.path.exists(output_dir):
             os.makedirs(output_dir)
         languageMaxLineWidth = self.__get_max_line_width(language)
         print("Max line width " + str(languageMaxLineWidth))
+        vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth, highlight_words=highlight_words)
+        srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth, highlight_words=highlight_words)
         output_files = []
         output_files.append(self.__create_file(srt, output_dir, source_name + "-subs.srt"));
             # 80 latin characters should fit on a 1080p/720p screen
             return 80
+    def __get_subs(self, segments: Iterator[dict], format: str, maxLineWidth: int, highlight_words: bool = False) -> str:
         segmentStream = StringIO()
         if format == 'vtt':
+            write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth, highlight_words=highlight_words)
         elif format == 'srt':
+            write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth, highlight_words=highlight_words)
         else:
             raise Exception("Unknown format " + format)
         gr.Number(label="Temperature increment on fallback", value=app_config.temperature_increment_on_fallback),
         gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold),
         gr.Number(label="Logprob threshold", value=app_config.logprob_threshold),
+        gr.Number(label="No speech threshold", value=app_config.no_speech_threshold),
+        # Word timestamps
+        gr.Checkbox(label="Word Timestamps", value=app_config.word_timestamps),
+        gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations),
+        gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations),
+        gr.Checkbox(label="Word Timestamps - Highlight Words", value=app_config.highlight_words),
     ], outputs=[
         gr.File(label="Download"),
         gr.Text(label="Transcription"),

cli.py CHANGED Viewed

@@ -95,6 +95,17 @@ def cli():
     parser.add_argument("--no_speech_threshold", type=optional_float, default=app_config.no_speech_threshold, \
                         help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
     args = parser.parse_args().__dict__
     model_name: str = args.pop("model")
     model_dir: str = args.pop("model_dir")
@@ -126,6 +137,7 @@ def cli():
     auto_parallel = args.pop("auto_parallel")
     compute_type = args.pop("compute_type")
     transcriber = WhisperTranscriber(delete_uploaded_files=False, vad_cpu_cores=vad_cpu_cores, app_config=app_config)
     transcriber.set_parallel_devices(args.pop("vad_parallel_devices"))
@@ -133,7 +145,7 @@ def cli():
     model = create_whisper_container(whisper_implementation=whisper_implementation, model_name=model_name,
                                      device=device, compute_type=compute_type, download_root=model_dir, models=app_config.models)
     if (transcriber._has_parallel_devices()):
         print("Using parallel devices:", transcriber.parallel_device_list)
@@ -158,7 +170,7 @@ def cli():
             result = transcriber.transcribe_file(model, source_path, temperature=temperature, vadOptions=vadOptions, **args)
-            transcriber.write_result(result, source_name, output_dir)
     transcriber.close()

     parser.add_argument("--no_speech_threshold", type=optional_float, default=app_config.no_speech_threshold, \
                         help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
+    parser.add_argument("--word_timestamps", type=str2bool, default=app_config.word_timestamps,
+                        help="(experimental) extract word-level timestamps and refine the results based on them")
+    parser.add_argument("--prepend_punctuations", type=str, default=app_config.prepend_punctuations,
+                        help="if word_timestamps is True, merge these punctuation symbols with the next word")
+    parser.add_argument("--append_punctuations", type=str, default=app_config.append_punctuations,
+                        help="if word_timestamps is True, merge these punctuation symbols with the previous word")
+    parser.add_argument("--highlight_words", type=str2bool, default=app_config.highlight_words,
+                        help="(requires --word_timestamps True) underline each word as it is spoken in srt and vtt")
+    parser.add_argument("--threads", type=optional_int, default=0,
+                        help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
     args = parser.parse_args().__dict__
     model_name: str = args.pop("model")
     model_dir: str = args.pop("model_dir")
     auto_parallel = args.pop("auto_parallel")
     compute_type = args.pop("compute_type")
+    highlight_words = args.pop("highlight_words")
     transcriber = WhisperTranscriber(delete_uploaded_files=False, vad_cpu_cores=vad_cpu_cores, app_config=app_config)
     transcriber.set_parallel_devices(args.pop("vad_parallel_devices"))
     model = create_whisper_container(whisper_implementation=whisper_implementation, model_name=model_name,
                                      device=device, compute_type=compute_type, download_root=model_dir, models=app_config.models)
     if (transcriber._has_parallel_devices()):
         print("Using parallel devices:", transcriber.parallel_device_list)
             result = transcriber.transcribe_file(model, source_path, temperature=temperature, vadOptions=vadOptions, **args)
+            transcriber.write_result(result, source_name, output_dir, highlight_words)
     transcriber.close()

config.json5 CHANGED Viewed

@@ -128,5 +128,14 @@
     // If the average log probability is lower than this value, treat the decoding as failed
     "logprob_threshold": -1.0,
     // If the probability of the <no-speech> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence
-    "no_speech_threshold": 0.6
 }

     // If the average log probability is lower than this value, treat the decoding as failed
     "logprob_threshold": -1.0,
     // If the probability of the <no-speech> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence
+    "no_speech_threshold": 0.6,
+    // (experimental) extract word-level timestamps and refine the results based on them
+    "word_timestamps": false,
+    // if word_timestamps is True, merge these punctuation symbols with the next word
+    "prepend_punctuations": "\"\'“¿([{-",
+    // if word_timestamps is True, merge these punctuation symbols with the previous word
+    "append_punctuations": "\"\'.。,，!！?？:：”)]}、",
+    // (requires --word_timestamps True) underline each word as it is spoken in srt and vtt
+    "highlight_words": false,
 }

src/config.py CHANGED Viewed

@@ -58,7 +58,11 @@ class ApplicationConfig:
                  condition_on_previous_text: bool = True, fp16: bool = True,
                  compute_type: str = "float16",
                  temperature_increment_on_fallback: float = 0.2, compression_ratio_threshold: float = 2.4,
-                 logprob_threshold: float = -1.0, no_speech_threshold: float = 0.6):
         self.models = models
@@ -104,6 +108,12 @@ class ApplicationConfig:
         self.logprob_threshold = logprob_threshold
         self.no_speech_threshold = no_speech_threshold
     def get_model_names(self):
         return [ x.name for x in self.models ]

                  condition_on_previous_text: bool = True, fp16: bool = True,
                  compute_type: str = "float16",
                  temperature_increment_on_fallback: float = 0.2, compression_ratio_threshold: float = 2.4,
+                 logprob_threshold: float = -1.0, no_speech_threshold: float = 0.6,
+                 # Word timestamp settings
+                 word_timestamps: bool = False, prepend_punctuations: str = "\"\'“¿([{-",
+                 append_punctuations: str = "\"\'.。,，!！?？:：”)]}、",
+                 highlight_words: bool = False):
         self.models = models
         self.logprob_threshold = logprob_threshold
         self.no_speech_threshold = no_speech_threshold
+        # Word timestamp settings
+        self.word_timestamps = word_timestamps
+        self.prepend_punctuations = prepend_punctuations
+        self.append_punctuations = append_punctuations
+        self.highlight_words = highlight_words
     def get_model_names(self):
         return [ x.name for x in self.models ]

src/utils.py CHANGED Viewed

@@ -3,7 +3,7 @@ import unicodedata
 import re
 import zlib
-from typing import Iterator, TextIO
 import tqdm
 import urllib3
@@ -56,10 +56,14 @@ def write_txt(transcript: Iterator[dict], file: TextIO):
         print(segment['text'].strip(), file=file, flush=True)
-def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
     print("WEBVTT\n", file=file)
-    for segment in transcript:
-        text = process_text(segment['text'], maxLineWidth).replace('-->', '->')
         print(
             f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
@@ -68,8 +72,8 @@ def write_vtt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
             flush=True,
         )
-def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
     """
     Write a transcript to a file in SRT format.
     Example usage:
@@ -81,8 +85,10 @@ def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
         with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
             write_srt(result["segments"], file=srt)
     """
-    for i, segment in enumerate(transcript, start=1):
-        text = process_text(segment['text'].strip(), maxLineWidth).replace('-->', '->')
         # write srt lines
         print(
@@ -94,6 +100,109 @@ def write_srt(transcript: Iterator[dict], file: TextIO, maxLineWidth=None):
             flush=True,
         )
 def process_text(text: str, maxLineWidth=None):
     if (maxLineWidth is None or maxLineWidth < 0):
         return text

 import re
 import zlib
+from typing import Iterator, TextIO, Union
 import tqdm
 import urllib3
         print(segment['text'].strip(), file=file, flush=True)
+def write_vtt(transcript: Iterator[dict], file: TextIO,
+              maxLineWidth=None, highlight_words: bool = False):
+    iterator  = __subtitle_preprocessor_iterator(transcript, maxLineWidth, highlight_words)
     print("WEBVTT\n", file=file)
+    for segment in iterator:
+        text = segment['text'].replace('-->', '->')
         print(
             f"{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}\n"
             flush=True,
         )
+def write_srt(transcript: Iterator[dict], file: TextIO,
+              maxLineWidth=None, highlight_words: bool = False):
     """
     Write a transcript to a file in SRT format.
     Example usage:
         with open(Path(output_dir) / (audio_basename + ".srt"), "w", encoding="utf-8") as srt:
             write_srt(result["segments"], file=srt)
     """
+    iterator  = __subtitle_preprocessor_iterator(transcript, maxLineWidth, highlight_words)
+    for i, segment in enumerate(iterator, start=1):
+        text = segment['text'].replace('-->', '->')
         # write srt lines
         print(
             flush=True,
         )
def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: int = None, highlight_words: bool = False):
    """
    Turn transcript segments into subtitle entries ready to be written out.

    Every yielded entry is a dict that has at least the keys 'start', 'end'
    and 'text'. Exactly one entry is yielded per segment unless
    highlight_words is enabled and the segment carries word data, in which
    case the segment is split into one entry per word (plus filler entries
    for any gaps between words).

    Parameters
    ----------
    transcript: Iterator[dict]
        Segments with 'start', 'end' and 'text' keys, and optionally a
        'words' list whose items have 'word', 'start' and 'end' keys.
    maxLineWidth: int
        Maximum line width used when wrapping the text. None or a negative
        value disables wrapping.
    highlight_words: bool
        If True, wrap the word currently being spoken in <u>...</u> tags.
        Only has an effect on segments that contain a non-empty 'words' list.
    """
    wrap_lines = maxLineWidth is not None and maxLineWidth >= 0

    for segment in transcript:
        words = segment.get('words', [])

        if len(words) == 0:
            # No word-level data: emit the segment exactly once, either
            # untouched (no wrapping requested) or with its text wrapped.
            if wrap_lines:
                yield {
                    'start': segment['start'],
                    'end': segment['end'],
                    'text': process_text(segment['text'].strip(), maxLineWidth)
                }
            else:
                yield segment

            # Skip the word-based handling below; it would otherwise emit
            # an additional entry with empty text for this segment.
            continue

        subtitle_start = segment['start']
        subtitle_end = segment['end']

        text_words = [ this_word["word"] for this_word in words ]
        subtitle_text = __join_words(text_words, maxLineWidth)

        if not highlight_words:
            # Just return the subtitle text
            yield {
                'start': subtitle_start,
                'end': subtitle_end,
                'text': subtitle_text
            }
            continue

        # Iterate over the words in the segment
        last = subtitle_start

        for i, this_word in enumerate(words):
            start = this_word['start']
            end = this_word['end']

            if last != start:
                # Display the un-highlighted text during the gap before this word
                yield {
                    'start': last,
                    'end': start,
                    'text': subtitle_text
                }

            # Underline only the i-th word, keeping any leading whitespace
            # outside of the tag
            highlighted = [
                re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word) if j == i else word
                for j, word in enumerate(text_words)
            ]

            if wrap_lines:
                # The HTML tags <u> and </u> are not displayed, so they should
                # not be counted in the word length used for line wrapping
                entries = [
                    { "word": marked, "length": len(word) }
                    for marked, word in zip(highlighted, text_words)
                ]
            else:
                # Without wrapping the lengths are irrelevant; plain strings
                # are always safe to join
                entries = highlighted

            # Display the text with the current word highlighted
            yield {
                'start': start,
                'end': end,
                'text': __join_words(entries, maxLineWidth)
            }
            last = end

        if last != subtitle_end:
            # Display the last part of the text
            yield {
                'start': last,
                'end': subtitle_end,
                'text': subtitle_text
            }
def __join_words(words: Iterator[Union[str, dict]], maxLineWidth: int = None):
    """
    Combine words into a single subtitle text, optionally wrapped into lines.

    Parameters
    ----------
    words: Iterator[Union[str, dict]]
        Each entry is either a plain string, or a dict with a 'word' field
        (the text to output) and a 'length' field (the width to count for
        that word when wrapping, which may differ from len(word) if the text
        contains markup).
    maxLineWidth: int
        Maximum counted width of a line. None or a negative value disables
        wrapping.

    Returns
    -------
    str
        Without wrapping: the words joined by a single space.
        With wrapping: the words concatenated without a separator, with a
        newline inserted before any word that would make the current line
        exceed maxLineWidth. A single word wider than maxLineWidth is kept
        on a line of its own rather than being split.
    """
    def unpack(entry):
        # Either accept a string or a dict with a 'word' and 'length' field
        if isinstance(entry, dict):
            return entry['word'], entry['length']
        return entry, len(entry)

    if maxLineWidth is None or maxLineWidth < 0:
        # Dict entries must be reduced to their text first; joining the
        # dicts directly would raise a TypeError.
        return " ".join(unpack(entry)[0] for entry in words)

    lines = []
    current_parts = []
    current_length = 0

    for entry in words:
        word, word_length = unpack(entry)

        # Start a new line if this word does not fit on a non-empty line
        if current_length > 0 and current_length + word_length > maxLineWidth:
            lines.append("".join(current_parts))
            current_parts = []
            current_length = 0

        current_length += word_length
        # The word will be prefixed with a space by Whisper, so we don't need to add one here
        current_parts.append(word)

    remainder = "".join(current_parts)
    if len(remainder) > 0:
        lines.append(remainder)

    return "\n".join(lines)
 def process_text(text: str, maxLineWidth=None):
     if (maxLineWidth is None or maxLineWidth < 0):
         return text

src/vad.py CHANGED Viewed

@@ -404,6 +404,14 @@ class AbstractTranscription(ABC):
             # Add to start and end
             new_segment['start'] = segment_start + adjust_seconds
             new_segment['end'] = segment_end + adjust_seconds
             result.append(new_segment)
         return result

             # Add to start and end
             new_segment['start'] = segment_start + adjust_seconds
             new_segment['end'] = segment_end + adjust_seconds
+            # Handle words
+            if ('words' in new_segment):
+                for word in new_segment['words']:
+                    # Adjust start and end
+                    word['start'] = word['start'] + adjust_seconds
+                    word['end'] = word['end'] + adjust_seconds
             result.append(new_segment)
         return result

src/whisper/whisperContainer.py CHANGED Viewed

@@ -203,8 +203,9 @@ class WhisperCallback(AbstractWhisperCallback):
         initial_prompt = self._get_initial_prompt(self.initial_prompt, self.initial_prompt_mode, prompt, segment_index)
-        return model.transcribe(audio, \
             language=self.language if self.language else detected_language, task=self.task, \
             initial_prompt=initial_prompt, \
             **decodeOptions
-        )

         initial_prompt = self._get_initial_prompt(self.initial_prompt, self.initial_prompt_mode, prompt, segment_index)
+        result = model.transcribe(audio, \
             language=self.language if self.language else detected_language, task=self.task, \
             initial_prompt=initial_prompt, \
             **decodeOptions
+        )
+        return result