import torch
import librosa
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor


def format_time(milliseconds):
    """Convert a duration in milliseconds to an SRT timestamp (HH:MM:SS,mmm)."""
    seconds, milliseconds = divmod(int(milliseconds), 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"


def detect_speech_activity(y, sr, frame_length=1024, hop_length=512, threshold=0.01):
    """Simple energy-based voice activity detection.

    Frames whose RMS energy exceeds `threshold` are treated as speech;
    consecutive speech frames are merged into (start, end) regions in seconds.
    """
    energy = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]
    speech_frames = energy > threshold
    speech_regions = []
    in_speech = False
    start = 0
    for i, speech in enumerate(speech_frames):
        if speech and not in_speech:
            start = i
            in_speech = True
        elif not speech and in_speech:
            speech_regions.append((start * hop_length / sr, i * hop_length / sr))
            in_speech = False
    if in_speech:
        # Audio ended while still inside a speech region; close it at the end.
        speech_regions.append((start * hop_length / sr, len(y) / sr))
    return speech_regions


def post_process_text(text):
    """Collapse runs of whitespace and trim the transcription."""
    return " ".join(text.split())


def transcribe_audio(audio_file):
    """Transcribe an audio file segment by segment and return SRT-formatted text."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_name = "Akashpb13/xlsr_kurmanji_kurdish"
    # Note: the processor and model are (re)loaded on every call; the first
    # call also downloads the checkpoint from the Hugging Face Hub.
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)

    # Wav2Vec2 expects 16 kHz mono input.
    y, sr = librosa.load(audio_file, sr=16000)
    voiced_segments = detect_speech_activity(y, sr, threshold=0.005)

    srt_content = ""
    subtitle_index = 1  # SRT numbering must stay consecutive even when segments are skipped
    for start, end in voiced_segments:
        segment_audio = y[int(start * sr):int(end * sr)]
        # Skip segments too short for the model's convolutional front-end.
        if len(segment_audio) < int(0.05 * sr):
            continue

        input_values = processor(segment_audio, sampling_rate=sr, return_tensors="pt").input_values
        input_values = input_values.to(device)
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]
        transcription = post_process_text(transcription)

        if transcription:
            start_time = format_time(start * 1000)
            end_time = format_time(end * 1000)
            srt_content += f"{subtitle_index}\n"
            srt_content += f"{start_time} --> {end_time}\n"
            # Break long subtitles into shorter lines (max ~50 characters).
            words = transcription.split()
            lines = []
            current_line = ""
            for word in words:
                if current_line and len(current_line) + len(word) > 50:
                    lines.append(current_line.strip())
                    current_line = ""
                current_line += word + " "
            if current_line:
                lines.append(current_line.strip())
            srt_content += "\n".join(lines) + "\n\n"
            subtitle_index += 1
    return srt_content


def save_srt(audio_file):
    """Transcribe the file, write the result to output.srt, and return both."""
    srt_content = transcribe_audio(audio_file)
    output_filename = "output.srt"
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(srt_content)
    return output_filename, srt_content


iface = gr.Interface(
    fn=save_srt,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.File(label="Download SRT"),
        gr.Textbox(label="SRT Content", lines=10),
    ],
    title="Kurdish Speech-to-Text Transcription",
    description="Upload an audio file to generate an SRT subtitle file with Kurdish transcription.",
)

if __name__ == "__main__":
    iface.launch()
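
# A minimal smoke test (hedged sketch, not part of the app): exercises the
# VAD and timestamp helpers on synthetic audio, with no Gradio UI and no
# model download. Uncomment to run; numpy is already a librosa dependency.
#
#   import numpy as np
#   sr = 16000
#   tone = 0.5 * np.sin(2 * np.pi * 440 * np.linspace(0, 1, sr, endpoint=False))
#   y = np.concatenate([np.zeros(sr), tone, np.zeros(sr)]).astype(np.float32)
#   print(detect_speech_activity(y, sr))  # roughly [(1.0, 2.0)]
#   print(format_time(83_456))            # "00:01:23,456"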