katospiegel commited on
Commit
ad29403
·
1 Parent(s): 80334c5

faster whisper timestamp

Browse files
Files changed (2) hide show
  1. README.md +9 -1
  2. transcription.py +63 -13
README.md CHANGED
@@ -32,4 +32,12 @@ The user will log in using a password and user specified by me. That user and p
32
  ## Pricing
33
 
34
  1. Calculate the fixed cost of a server running for a long period of time.
35
- 2. Check if I can use the hibernation period to save some money.
 
 
 
 
 
 
 
 
 
32
  ## Pricing
33
 
34
  1. Calculate the fixed cost of a server running for a long period of time.
35
+ 2. Check if I can use the hibernation period to save some money.
36
+
37
+ ## Development
38
+
39
+ - [ ] Add word-level timestamps
40
+ - [ ] Add Accuracy at word level
41
+ - [ ] Add mel spectrogram?
42
+ - [ ] Add Whisper parameters to the interface
43
+ - [ ]
transcription.py CHANGED
@@ -63,25 +63,66 @@ def fast_transcription(audio_file, whisper_model, language):
63
  # Transcribe audio
64
  options = dict(language=language, beam_size=5, best_of=5)
65
  transcribe_options = dict(task="transcribe", **options)
66
- segments_raw, info = model.transcribe(audio_file, **transcribe_options)
67
 
68
- # Convert back to original openai format
 
 
 
 
 
 
 
 
 
 
 
69
  segments = []
70
- i = 0
71
- for segment_chunk in segments_raw:
72
- chunk = {}
73
- chunk["start"] = segment_chunk.start
74
- chunk["end"] = segment_chunk.end
75
- chunk["text"] = segment_chunk.text
76
- segments.append(chunk)
77
- i += 1
78
- print("transcribe audio done with fast whisper")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  except Exception as e:
80
  raise RuntimeError("Error converting video to audio")
81
 
82
  #text from the list
 
 
83
 
84
- return [str(s["start"]) + " " + s["text"] for s in segments] #pd.DataFrame(segments)
85
 
86
  import datetime
87
 
@@ -215,4 +256,13 @@ def speech_to_text(audio_file, selected_source_lang, whisper_model, num_speakers
215
  return df_results, system_info, save_path
216
 
217
  except Exception as e:
218
- raise RuntimeError("Error Running inference with local model", e)
 
 
 
 
 
 
 
 
 
 
63
  # Transcribe audio
64
  options = dict(language=language, beam_size=5, best_of=5)
65
  transcribe_options = dict(task="transcribe", **options)
66
+ segments_generator, info = model.transcribe(audio_file, **transcribe_options)
67
 
68
+ #segments_raw, info = model.transcribe(audio_file, **transcribe_options)
69
+ # # Convert back to original openai format
70
+ # segments = []
71
+ # i = 0
72
+ # for segment_chunk in segments_raw:
73
+ # chunk = {}
74
+ # chunk["start"] = segment_chunk.start
75
+ # chunk["end"] = segment_chunk.end
76
+ # chunk["text"] = segment_chunk.text
77
+ # segments.append(chunk)
78
+ # i += 1
79
+ # print("transcribe audio done with fast whisper")
80
  segments = []
81
+
82
+ for segment in segments_generator:
83
+ segments.append(segment)
84
+
85
+ # if progress_listener is not None:
86
+ # progress_listener.on_progress(segment.end, info.duration)
87
+ # if verbose:
88
+ # print("[{}->{}] {}".format(format_timestamp(segment.start, True), format_timestamp(segment.end, True),
89
+ # segment.text))
90
+
91
+ text = " ".join([segment.text for segment in segments])
92
+
93
+ # Convert the segments to a format that is easier to serialize
94
+ whisper_segments = [{
95
+ "text": segment.text,
96
+ "start": segment.start,
97
+ "end": segment.end,
98
+
99
+ # Extra fields added by faster-whisper
100
+ "words": [{
101
+ "start": word.start,
102
+ "end": word.end,
103
+ "word": word.word,
104
+ "probability": word.probability
105
+ } for word in (segment.words if segment.words is not None else []) ]
106
+ } for segment in segments]
107
+
108
+ result = {
109
+ "segments": whisper_segments,
110
+ "text": text,
111
+ "language": info.language if info else None,
112
+
113
+ # Extra fields added by faster-whisper
114
+ "language_probability": info.language_probability if info else None,
115
+ "duration": info.duration if info else None
116
+ }
117
+
118
  except Exception as e:
119
  raise RuntimeError("Error converting video to audio")
120
 
121
  #text from the list
122
+
123
+ return result
124
 
125
+ #return [str(s["start"]) + " " + s["text"] for s in segments] #pd.DataFrame(segments)
126
 
127
  import datetime
128
 
 
256
  return df_results, system_info, save_path
257
 
258
  except Exception as e:
259
+ raise RuntimeError("Error Running inference with local model", e)
260
+
261
+
262
+
263
+ #######################################################################
264
+
265
def fast_whisper(audio_file, whisper_model="large_v2", language="es"):
    """Transcribe an audio file with faster-whisper and return the result dict.

    Thin convenience wrapper around this module's ``fast_transcription``,
    pre-filling the model size and language defaults.

    Parameters
    ----------
    audio_file : str
        Path to the audio file to transcribe.
    whisper_model : str, optional
        Model size passed through to ``fast_transcription``
        (default ``"large_v2"``).
        NOTE(review): faster-whisper model identifiers normally use a
        hyphen ("large-v2") — confirm ``fast_transcription`` normalizes
        this name before loading the model.
    language : str, optional
        Language code for transcription (default ``"es"``).

    Returns
    -------
    dict
        The result dict produced by ``fast_transcription`` (``segments``
        with word-level timestamps, ``text``, and language/duration info).
    """
    # The original stub returned the undefined name `out` (NameError on
    # every call); delegate to the existing implementation instead.
    out = fast_transcription(audio_file, whisper_model, language)
    return out