import json
import gradio as gr
from faster_whisper import WhisperModel  # pip install faster-whisper


def split_text_into_lines(data, max_chars=30, max_duration=2.5, max_gap=1.5):
    """Group word-level timestamps into subtitle lines.

    A line is closed when its text exceeds *max_chars*, its spoken duration
    exceeds *max_duration*, or a silence longer than *max_gap* seconds
    separates consecutive words.

    Args:
        data: list of dicts with keys "word", "start", "end" (seconds).
        max_chars: maximum characters of joined text per line.
        max_duration: maximum summed word duration (seconds) per line.
        max_gap: maximum silence (seconds) allowed between adjacent words.

    Returns:
        List of dicts: {"word": joined text, "start": line start time,
        "end": line end time, "textcontents": the word dicts in the line}.
    """
    subtitles = []
    line = []
    line_duration = 0.0

    def flush_line():
        """Emit the current line (if non-empty) as one subtitle and reset."""
        nonlocal line, line_duration
        if line:
            subtitles.append({
                "word": " ".join(item["word"] for item in line),
                "start": line[0]["start"],
                "end": line[-1]["end"],
                "textcontents": line,
            })
        line = []
        line_duration = 0.0

    for idx, word_data in enumerate(data):
        # Bug fix: a silence longer than max_gap means the CURRENT word begins
        # a new line. The original flushed after appending, which attached the
        # post-gap word to the previous subtitle line.
        if idx > 0 and (word_data["start"] - data[idx - 1]["end"]) > max_gap:
            flush_line()

        line.append(word_data)
        line_duration += word_data["end"] - word_data["start"]

        # Length/duration limits are checked after appending on purpose: the
        # line that just crossed a limit is emitted including the last word.
        text = " ".join(item["word"] for item in line)
        if line_duration > max_duration or len(text) > max_chars:
            flush_line()

    flush_line()  # emit any trailing partial line
    return subtitles


# Cache of loaded models keyed by size, so repeated requests reuse one model
# instead of re-downloading/re-initializing it per transcription.
_WHISPER_MODELS = {}


def _get_model(model_size):
    """Return a cached WhisperModel for *model_size*, loading it on first use."""
    if model_size not in _WHISPER_MODELS:
        _WHISPER_MODELS[model_size] = WhisperModel(model_size)
    return _WHISPER_MODELS[model_size]


def transcribe_audio(audiofilename, model_size="medium"):
    """Transcribe an audio file and return line-level subtitle dicts.

    Args:
        audiofilename: path to the audio file on disk.
        model_size: faster-whisper model size (default "medium").

    Returns:
        The output of split_text_into_lines() for the transcription.
    """
    model = _get_model(model_size)
    segments, _info = model.transcribe(audiofilename, word_timestamps=True)
    # segments is a lazy generator; iterating it runs the transcription.
    wordlevel_info = [
        {"word": word.word, "start": word.start, "end": word.end}
        for segment in segments
        for word in segment.words
    ]
    return split_text_into_lines(wordlevel_info)


def audio_transcription(audiofile):
    """Gradio handler: accept either a filepath string or an uploaded-file
    object exposing ``.name`` (older Gradio versions), and transcribe it."""
    path = audiofile if isinstance(audiofile, str) else audiofile.name
    return transcribe_audio(path)


if __name__ == "__main__":
    # gr.inputs / gr.outputs were removed in Gradio 3.x; use the top-level
    # component classes. type="filepath" hands the handler a path string.
    demo = gr.Interface(
        fn=audio_transcription,
        inputs=gr.File(label="Upload Audio File", type="filepath"),
        outputs=gr.JSON(label="Transcription Output"),
        title="Audio Transcription",
        description="Upload an audio file and get the transcription in JSON format.",
    )
    demo.launch()