Having a timestamp problem while trying to make a .srt file from the audio of a video
#52
by muratowski - opened
I tried several different approaches, with help from AI coding models, to get this model to transcribe the audio with proper timestamps so that I can build a .srt file, but I always end up with something like this:
1
00:00:00,000 --> 00:00:30,000
Du zuerst, Digga. Nee, erst du. Digga geht vor. Okay, Mann. Ein Döner. Ich auch. Mach zwei Döner. Sorry, aber ich hab nur noch einen.
2
00:00:00,000 --> 00:00:30,000
Untertitelung des ZDF, 2020
3
00:00:00,000 --> 00:00:30,000
Untertitelung des ZDF, 2020
4
00:00:00,000 --> 00:00:30,000
Hey, Jungs. Wozu streiten? Bei King of Kebab gibt's doch mehr als nur Döner
...etc.
As you can see, it doesn't produce the correct timestamps: every entry gets the same 00:00:00,000 --> 00:00:30,000 range.
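What I'm hoping to get instead is one entry per line of dialogue, each with its own start and end time, something like this (timestamps made up for illustration):

1
00:00:01,200 --> 00:00:04,500
Du zuerst, Digga. Nee, erst du. Digga geht vor.

2
00:00:04,800 --> 00:00:06,900
Okay, Mann. Ein Döner. Ich auch. Mach zwei Döner.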
Here is the code:
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import torch
import librosa
import scipy.io.wavfile as wavfile
import numpy as np
import os
from datetime import timedelta
import re
from pydub import AudioSegment
import gc
def convert_to_wav(file_path):
    audio, sr = librosa.load(file_path, sr=16000)  # Whisper expects 16kHz sampling rate
    wav_file_path = os.path.join("/kaggle/working", os.path.splitext(os.path.basename(file_path))[0] + ".wav")
    wavfile.write(wav_file_path, sr, (audio * 32767).astype(np.int16))  # Use scipy.io.wavfile.write
    return wav_file_path
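# Build the Whisper pipeline, transcribe a single audio file, and request
# word-level timestamps through generate_kwargs.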
def transcribe_audio(file_path, language):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if device == "cuda" else torch.float32
    model_id = "openai/whisper-large-v3-turbo"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True)
    model.to(device)
    processor = AutoProcessor.from_pretrained(model_id)
    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        chunk_length_s=30,  # For long audio files, process in chunks of 30 seconds
        batch_size=24,  # Reducing batch size to save memory
        device=device
    )
    generate_kwargs = {
        "language": language,
        "task": "transcribe",
        "return_timestamps": "word"  # Use "word" for word-level timestamps
    }
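    # (I'm not sure whether "return_timestamps" belongs inside generate_kwargs here,
    # or whether it should be passed to pipe() directly as its own argument)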
    result = pipe(file_path, generate_kwargs=generate_kwargs)
    # Free up memory
    del model
    del processor
    del pipe
    gc.collect()
    torch.cuda.empty_cache()
    return result
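# Convert a float number of seconds into the SRT hh:mm:ss,mmm timestamp format.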
def format_timestamp(seconds):
    td = timedelta(seconds=seconds)
    milliseconds = int(td.microseconds / 1000)
    return f"{td.seconds//3600:02d}:{(td.seconds//60)%60:02d}:{td.seconds%60:02d},{milliseconds:03d}"
def clean_text(text):
    return re.sub(r'\s+', ' ', text).strip()
def segments_to_srt(segments):
    srt_text = ""
    for i, segment in enumerate(segments, start=1):
        start = format_timestamp(segment["start"])
        end = format_timestamp(segment["end"])
        text = clean_text(segment["text"])
        srt_text += f"{i}\n{start} --> {end}\n{text}\n\n"
    return srt_text
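# Normalize whatever shape the pipeline result has ("words", "chunks", "segments",
# or just a plain "text" field) into a flat list of {"text", "start", "end"} dicts.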
def handle_transcription_result(result, chunk_start_time=0):
    segments = []
    if "words" in result:
        for word_info in result["words"]:
            word = word_info.get("word", "")
            start = word_info.get("start", 0.0) + chunk_start_time
            end = word_info.get("end", 0.0) + chunk_start_time
            segments.append({
                "text": word,
                "start": start,
                "end": end
            })
    elif "chunks" in result:
        for chunk in result["chunks"]:
            text = chunk.get("text", "")
            start = chunk.get("timestamp")[0] + chunk_start_time
            end = chunk.get("timestamp")[1] + chunk_start_time
            segments.append({
                "text": text,
                "start": start,
                "end": end
            })
    elif "segments" in result:
        for segment in result["segments"]:
            text = segment.get("text", "")
            start = segment.get("start", 0.0) + chunk_start_time
            end = segment.get("end", 0.0) + chunk_start_time
            segments.append({
                "text": text,
                "start": start,
                "end": end
            })
    else:
        # Fallback: single segment with full text
        segments = [{
            "start": chunk_start_time,
            "end": chunk_start_time + 30,  # Assuming chunk length is 30 seconds
            "text": result["text"]
        }]
    return segments
def split_audio(file_path, chunk_length_ms=30000):
    audio = AudioSegment.from_file(file_path)
    chunks = [audio[i:i + chunk_length_ms] for i in range(0, len(audio), chunk_length_ms)]
    return chunks
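# Transcribe each 30-second pydub chunk separately and shift its timestamps
# by the chunk's start offset within the full file.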
def process_chunks(chunks, language, chunk_length_ms=30000):
    results = []
    total_duration = 0
    for i, chunk in enumerate(chunks):
        chunk_file_path = f"/kaggle/working/temp_chunk_{i}.wav"
        chunk.export(chunk_file_path, format="wav")
        result = transcribe_audio(chunk_file_path, language)
        # Calculate chunk start time in seconds
        chunk_start_time = total_duration
        total_duration += chunk_length_ms / 1000.0
        # Handle transcription result with chunk time
        segments = handle_transcription_result(result, chunk_start_time=chunk_start_time)
        results.extend(segments)
        os.remove(chunk_file_path)
    return results
if __name__ == "__main__":
    # Step 1: Upload the file
    file_path = "/kaggle/input/file-to-convert/Kebab Connection (2005) Filme Deustche HD.mp3"
    # Step 2: Convert to WAV for compatibility
    wav_file_path = convert_to_wav(file_path)
    # Step 3: Set the language code for German
    language = "de"
    # Step 4: Split the audio file into chunks of 30 seconds each
    # For a 1 hour and 30 minutes audio file, this will create 180 chunks
    chunks = split_audio(wav_file_path)
    # Step 5: Transcribe each chunk
    results = process_chunks(chunks, language)
    # Step 6: Combine the results and convert to SRT
    srt_content = segments_to_srt(results)
    # Step 7: Save SRT file
    srt_filename = "/kaggle/working/output2.srt"
    with open(srt_filename, "w", encoding="utf-8") as srt_file:
        srt_file.write(srt_content)
    print(f"SRT file saved to {srt_filename}")
    # Remove temporary WAV file
    os.remove(wav_file_path)
    print("Download link for the SRT file has been provided above.")
So what's wrong here? What should I change to get the desired .srt output?
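For reference, this is the much simpler variant I'm considering based on my reading of the transformers docs: passing return_timestamps directly to the pipeline call instead of through generate_kwargs, and reading the timestamps from result["chunks"]. The audio path below is just a placeholder. Is something like this the intended usage?

import torch
from transformers import pipeline

# Minimal sketch (not my full script): let the pipeline handle chunking and
# timestamps itself instead of splitting the audio manually with pydub.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo",
    chunk_length_s=30,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# return_timestamps=True is supposed to put segment-level timestamps into
# result["chunks"], each entry looking like {"text": ..., "timestamp": (start, end)}.
result = pipe(
    "/kaggle/working/some_audio.wav",  # placeholder path
    return_timestamps=True,
    generate_kwargs={"language": "de", "task": "transcribe"},
)

for chunk in result["chunks"]:
    start, end = chunk["timestamp"]
    print(start, end, chunk["text"])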