katospiegel commited on
Commit
ad29403
·
1 Parent(s): 80334c5

faster whisper timestamp

Browse files
Files changed (2) hide show
  1. README.md +9 -1
  2. transcription.py +63 -13
README.md CHANGED
@@ -32,4 +32,12 @@ The user will log in using a password and user specified by me. That user and p
32
  ## Pricing
33
 
34
  1. Calculate the fixed cost of a server running for a long period of time.
35
- 2. Check if I can use the hibernation period to save some money.
 
 
 
 
 
 
 
 
 
32
  ## Pricing
33
 
34
  1. Calculate the fixed cost of a server running for a long period of time.
35
+ 2. Check if I can use the hibernation period to save some money.
36
+
37
+ ## Development
38
+
39
+ - [ ] Add word-level timestamps
40
+ - [ ] Add Accuracy at word level
41
+ - [ ] Add mel spectrogram?
42
+ - [ ] Add Whisper parameters to the interface
43
+ - [ ]
transcription.py CHANGED
@@ -63,25 +63,66 @@ def fast_transcription(audio_file, whisper_model, language):
63
  # Transcribe audio
64
  options = dict(language=language, beam_size=5, best_of=5)
65
  transcribe_options = dict(task="transcribe", **options)
66
- segments_raw, info = model.transcribe(audio_file, **transcribe_options)
67
 
68
- # Convert back to original openai format
 
 
 
 
 
 
 
 
 
 
 
69
  segments = []
70
- i = 0
71
- for segment_chunk in segments_raw:
72
- chunk = {}
73
- chunk["start"] = segment_chunk.start
74
- chunk["end"] = segment_chunk.end
75
- chunk["text"] = segment_chunk.text
76
- segments.append(chunk)
77
- i += 1
78
- print("transcribe audio done with fast whisper")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  except Exception as e:
80
  raise RuntimeError("Error converting video to audio")
81
 
82
  #text from the list
 
 
83
 
84
- return [str(s["start"]) + " " + s["text"] for s in segments] #pd.DataFrame(segments)
85
 
86
  import datetime
87
 
@@ -215,4 +256,13 @@ def speech_to_text(audio_file, selected_source_lang, whisper_model, num_speakers
215
  return df_results, system_info, save_path
216
 
217
  except Exception as e:
218
- raise RuntimeError("Error Running inference with local model", e)
 
 
 
 
 
 
 
 
 
 
63
  # Transcribe audio
64
  options = dict(language=language, beam_size=5, best_of=5)
65
  transcribe_options = dict(task="transcribe", **options)
66
+ segments_generator, info = model.transcribe(audio_file, **transcribe_options)
67
 
68
+ #segments_raw, info = model.transcribe(audio_file, **transcribe_options)
69
+ # # Convert back to original openai format
70
+ # segments = []
71
+ # i = 0
72
+ # for segment_chunk in segments_raw:
73
+ # chunk = {}
74
+ # chunk["start"] = segment_chunk.start
75
+ # chunk["end"] = segment_chunk.end
76
+ # chunk["text"] = segment_chunk.text
77
+ # segments.append(chunk)
78
+ # i += 1
79
+ # print("transcribe audio done with fast whisper")
80
  segments = []
81
+
82
+ for segment in segments_generator:
83
+ segments.append(segment)
84
+
85
+ # if progress_listener is not None:
86
+ # progress_listener.on_progress(segment.end, info.duration)
87
+ # if verbose:
88
+ # print("[{}->{}] {}".format(format_timestamp(segment.start, True), format_timestamp(segment.end, True),
89
+ # segment.text))
90
+
91
+ text = " ".join([segment.text for segment in segments])
92
+
93
+ # Convert the segments to a format that is easier to serialize
94
+ whisper_segments = [{
95
+ "text": segment.text,
96
+ "start": segment.start,
97
+ "end": segment.end,
98
+
99
+ # Extra fields added by faster-whisper
100
+ "words": [{
101
+ "start": word.start,
102
+ "end": word.end,
103
+ "word": word.word,
104
+ "probability": word.probability
105
+ } for word in (segment.words if segment.words is not None else []) ]
106
+ } for segment in segments]
107
+
108
+ result = {
109
+ "segments": whisper_segments,
110
+ "text": text,
111
+ "language": info.language if info else None,
112
+
113
+ # Extra fields added by faster-whisper
114
+ "language_probability": info.language_probability if info else None,
115
+ "duration": info.duration if info else None
116
+ }
117
+
118
  except Exception as e:
119
  raise RuntimeError("Error converting video to audio")
120
 
121
  #text from the list
122
+
123
+ return result
124
 
125
+ #return [str(s["start"]) + " " + s["text"] for s in segments] #pd.DataFrame(segments)
126
 
127
  import datetime
128
 
 
256
  return df_results, system_info, save_path
257
 
258
  except Exception as e:
259
+ raise RuntimeError("Error Running inference with local model", e)
260
+
261
+
262
+
263
+ #######################################################################
264
+
265
def fast_whisper(audio_file, whisper_model="large_v2", language="es"):
    """Transcribe an audio file with faster-whisper and return the result dict.

    Thin convenience wrapper around this module's ``fast_transcription``,
    pre-filling the model size and language defaults.

    Parameters
    ----------
    audio_file : str
        Path to the audio file to transcribe.
    whisper_model : str, optional
        Model size passed through to ``fast_transcription``
        (default ``"large_v2"``).
        NOTE(review): faster-whisper model identifiers normally use a
        hyphen ("large-v2") — confirm ``fast_transcription`` normalizes
        this name before loading the model.
    language : str, optional
        Language code for transcription (default ``"es"``).

    Returns
    -------
    dict
        The result dict produced by ``fast_transcription`` (``segments``
        with word-level timestamps, ``text``, and language/duration info).
    """
    # The original stub returned the undefined name `out` (NameError on
    # every call); delegate to the existing implementation instead.
    out = fast_transcription(audio_file, whisper_model, language)
    return out