Having a timestamp problem while trying to make a .srt file from the audio of a video
#52
by muratowski - opened
I tried several different approaches, with help from AI coding models, to get this model to transcribe the audio with proper timestamps so that I can build a .srt file, but I always end up with something like this:
1
00:00:00,000 --> 00:00:30,000
Du zuerst, Digga. Nee, erst du. Digga geht vor. Okay, Mann. Ein Döner. Ich auch. Mach zwei Döner. Sorry, aber ich hab nur noch einen.
2
00:00:00,000 --> 00:00:30,000
Untertitelung des ZDF, 2020
3
00:00:00,000 --> 00:00:30,000
Untertitelung des ZDF, 2020
4
00:00:00,000 --> 00:00:30,000
Hey, Jungs. Wozu streiten? Bei King of Kebab gibt's doch mehr als nur Döner
...etc.
As you can see, it doesn't produce the correct timestamps: every entry gets the same 00:00:00,000 --> 00:00:30,000 range.
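What I'm hoping to get instead is one entry per line of dialogue, each with its own start and end time, something like this (timestamps made up for illustration):

1
00:00:01,200 --> 00:00:04,500
Du zuerst, Digga. Nee, erst du. Digga geht vor.

2
00:00:04,800 --> 00:00:06,900
Okay, Mann. Ein Döner. Ich auch. Mach zwei Döner.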
Here is the code:
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
import librosa
import scipy.io.wavfile as wavfile
import numpy as np
import os
from datetime import timedelta
import re
from pydub import AudioSegment
import gc
def convert_to_wav(file_path):
    audio, sr = librosa.load(file_path, sr=16000)  # Whisper expects 16kHz sampling rate
    wav_file_path = os.path.join("/kaggle/working", os.path.splitext(os.path.basename(file_path))[0] + ".wav")
    wavfile.write(wav_file_path, sr, (audio * 32767).astype(np.int16))  # Use scipy.io.wavfile.write
    return wav_file_path
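# Build the Whisper pipeline, transcribe a single audio file, and request
# word-level timestamps through generate_kwargs.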
def transcribe_audio(file_path, language):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if device == "cuda" else torch.float32
    model_id = "openai/whisper-large-v3-turbo"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True)
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        chunk_length_s=30,  # For long audio files, process in chunks of 30 seconds
        batch_size=24,  # Reducing batch size to save memory
        device=device
    )
    generate_kwargs = {
        "language": language,
        "task": "transcribe",
        "return_timestamps": "word"  # Use "word" for word-level timestamps
    }
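    # (I'm not sure whether "return_timestamps" belongs inside generate_kwargs here,
    # or whether it should be passed to pipe() directly as its own argument)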
    result = pipe(file_path, generate_kwargs=generate_kwargs)
    # Free up memory
    del model
    del processor
    del pipe
    gc.collect()
    torch.cuda.empty_cache()
    return result
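# Convert a float number of seconds into the SRT hh:mm:ss,mmm timestamp format.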
def format_timestamp(seconds):
    td = timedelta(seconds=seconds)
    milliseconds = int(td.microseconds / 1000)
    return f"{td.seconds//3600:02d}:{(td.seconds//60)%60:02d}:{td.seconds%60:02d},{milliseconds:03d}"
def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()
def segments_to_srt(segments):
    srt_text = ""
    for i, segment in enumerate(segments, start=1):
        start = format_timestamp(segment["start"])
        end = format_timestamp(segment["end"])
        text = clean_text(segment["text"])
        srt_text += f"{i}\n{start} --> {end}\n{text}\n\n"
    return srt_text
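# Normalize whatever shape the pipeline result has ("words", "chunks", "segments",
# or just a plain "text" field) into a flat list of {"text", "start", "end"} dicts.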
def handle_transcription_result(result, chunk_start_time=0):
    segments = []
    if "words" in result:
        for word_info in result["words"]:
            word = word_info.get("word", "")
            start = word_info.get("start", 0.0) + chunk_start_time
            end = word_info.get("end", 0.0) + chunk_start_time
            segments.append({
                "text": word,
                "start": start,
                "end": end
            })
    elif "chunks" in result:
        for chunk in result["chunks"]:
            text = chunk.get("text", "")
            start = chunk.get("timestamp")[0] + chunk_start_time
            end = chunk.get("timestamp")[1] + chunk_start_time
            segments.append({
                "text": text,
                "start": start,
                "end": end
            })
    elif "segments" in result:
        for segment in result["segments"]:
            text = segment.get("text", "")
            start = segment.get("start", 0.0) + chunk_start_time
            end = segment.get("end", 0.0) + chunk_start_time
            segments.append({
                "text": text,
                "start": start,
                "end": end
            })
    else:
        # Fallback: single segment with full text
        segments = [{
            "start": chunk_start_time,
            "end": chunk_start_time + 30,  # Assuming chunk length is 30 seconds
            "text": result["text"]
        }]
    return segments
def split_audio(file_path, chunk_length_ms=30000):
    audio = AudioSegment.from_file(file_path)
    chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
    return chunks
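# Transcribe each 30-second pydub chunk separately and shift its timestamps
# by the chunk's start offset within the full file.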
def process_chunks(chunks, language, chunk_length_ms=30000):
    results = []
    total_duration = 0
    for i, chunk in enumerate(chunks):
        chunk_file_path = f"/kaggle/working/temp_chunk_{i}.wav"
        chunk.export(chunk_file_path, format="wav")
        result = transcribe_audio(chunk_file_path, language)
        # Calculate chunk start time in seconds
        chunk_start_time = total_duration
        total_duration += chunk_length_ms / 1000.0
        # Handle transcription result with chunk time
        segments = handle_transcription_result(result, chunk_start_time=chunk_start_time)
        results.extend(segments)
        os.remove(chunk_file_path)
    return results
if __name__ == "__main__":
    # Step 1: Upload the file
    file_path = "/kaggle/input/file-to-convert/Kebab Connection (2005) Filme Deustche HD.mp3"
    # Step 2: Convert to WAV for compatibility
    wav_file_path = convert_to_wav(file_path)
    # Step 3: Set the language code for German
    language = "de"
    # Step 4: Split the audio file into chunks of 30 seconds each
    # For a 1 hour and 30 minutes audio file, this will create 180 chunks
    chunks = split_audio(wav_file_path)
    # Step 5: Transcribe each chunk
    results = process_chunks(chunks, language)
    # Step 6: Combine the results and convert to SRT
    srt_content = segments_to_srt(results)
    # Step 7: Save SRT file
    srt_filename = "/kaggle/working/output2.srt"
    with open(srt_filename, "w", encoding="utf-8") as srt_file:
        srt_file.write(srt_content)
    print(f"SRT file saved to {srt_filename}")
    # Remove temporary WAV file
    os.remove(wav_file_path)
    print("Download link for the SRT file has been provided above.")
So what's wrong here? What should I change to get the desired .srt output?
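For reference, this is the much simpler variant I'm considering based on my reading of the transformers docs: passing return_timestamps directly to the pipeline call instead of through generate_kwargs, and reading the timestamps from result["chunks"]. The audio path below is just a placeholder. Is something like this the intended usage?

import torch
from transformers import pipeline

# Minimal sketch (not my full script): let the pipeline handle chunking and
# timestamps itself instead of splitting the audio manually with pydub.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    chunk_length_s=30,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# return_timestamps=True is supposed to put segment-level timestamps into
# result["chunks"], each entry looking like {"text": ..., "timestamp": (start, end)}.
result = pipe(
    "/kaggle/working/some_audio.wav",  # placeholder path
    return_timestamps=True,
    generate_kwargs={"language": "de", "task": "transcribe"},
)

for chunk in result["chunks"]:
    start, end = chunk["timestamp"]
    print(start, end, chunk["text"])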