import torch
import librosa
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

def format_time(milliseconds):
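    """Convert a duration in milliseconds to an SRT timestamp (HH:MM:SS,mmm)."""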
    seconds, milliseconds = divmod(int(milliseconds), 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"

def detect_speech_activity(y, sr, frame_length=1024, hop_length=512, threshold=0.01):
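    """Energy-based voice activity detection.

    Computes frame-wise RMS energy and returns (start, end) pairs, in seconds,
    for contiguous regions whose energy exceeds the threshold.
    """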
    energy = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]
    speech_frames = energy > threshold

    speech_regions = []
    in_speech = False
    for i, speech in enumerate(speech_frames):
        if speech and not in_speech:
            start = i
            in_speech = True
        elif not speech and in_speech:
            end = i
            speech_regions.append((start * hop_length / sr, end * hop_length / sr))
            in_speech = False

    # If the audio ends while speech is still active, close the final region
    if in_speech:
        speech_regions.append((start * hop_length / sr, len(y) / sr))

    return speech_regions

def post_process_text(text):
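    """Collapse repeated whitespace and trim the transcription."""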
    text = " ".join(text.split())
    return text

def transcribe_audio(audio_file):
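    """Transcribe an audio file segment by segment and return SRT-formatted text."""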
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # The processor and model are re-loaded on every call; for repeated use
    # they could be loaded once at module level instead.
    model_name = "Akashpb13/xlsr_kurmanji_kurdish"
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)

    # Load the audio resampled to 16 kHz, the rate the Wav2Vec2 model expects
    y, sr = librosa.load(audio_file, sr=16000)
    voiced_segments = detect_speech_activity(y, sr, threshold=0.005)

    srt_content = ""
    # Keep a separate counter so SRT cue numbers stay consecutive even when
    # a segment produces an empty transcription and is skipped.
    subtitle_index = 0
    for start, end in voiced_segments:
        segment_audio = y[int(start * sr):int(end * sr)]

        input_values = processor(segment_audio, sampling_rate=sr, return_tensors="pt").input_values
        input_values = input_values.to(device)

        with torch.no_grad():
            logits = model(input_values).logits

        # Greedy CTC decoding: take the most likely token at each time step
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]

        transcription = post_process_text(transcription)

        if transcription:
            start_time = format_time(start * 1000)
            end_time = format_time(end * 1000)

            subtitle_index += 1
            srt_content += f"{subtitle_index}\n"
            srt_content += f"{start_time} --> {end_time}\n"
            
            # Break long lines into shorter ones (max 50 characters)
            words = transcription.split()
            lines = []
            current_line = ""
            for word in words:
                if current_line and len(current_line) + len(word) > 50:
                    lines.append(current_line.strip())
                    current_line = ""
                current_line += word + " "
            if current_line:
                lines.append(current_line.strip())
            
            srt_content += "\n".join(lines) + "\n\n"

    return srt_content

def save_srt(audio_file):
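    """Transcribe the audio and write the result to an SRT file for download."""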
    srt_content = transcribe_audio(audio_file)
    output_filename = "output.srt"
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(srt_content)
    return output_filename, srt_content

iface = gr.Interface(
    fn=save_srt,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.File(label="Download SRT"),
        gr.Textbox(label="SRT Content", lines=10)
    ],
    title="Kurdish Speech-to-Text Transcription",
    description="Upload an audio file to generate an SRT subtitle file with a Kurdish transcription."
)

if __name__ == "__main__":
    iface.launch()