darag committed on
Commit
37cc811
1 Parent(s): d7633c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -1
app.py CHANGED
@@ -1,3 +1,72 @@
 
 
 
 
 
 
1
  import gradio as gr
 
2
 
3
- gr.Interface.load("https://89879aa8bd9cacb387.gradio.live").launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import torch
3
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
4
+ import librosa
5
+ import numpy as np
6
+ from datetime import timedelta
7
  import gradio as gr
8
+ import os
9
 
10
def format_time(seconds):
    """Format an offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset from the start of the media, in seconds (int or
            float). Negative values are clamped to zero, since an SRT
            timestamp cannot express them.

    Returns:
        The timestamp string. Precision below one millisecond is truncated.
        For offsets of 24 hours or more the hours field keeps growing
        (e.g. ``25:00:00,000``) instead of wrapping back to zero.
    """
    td = max(timedelta(seconds=seconds), timedelta(0))
    # Derive everything from the whole duration: ``td.seconds`` alone leaves
    # out ``td.days`` and would silently wrap at the 24-hour mark.
    total_ms = td // timedelta(milliseconds=1)
    total_secs, milliseconds = divmod(total_ms, 1000)
    hours, remainder = divmod(total_secs, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
16
+
17
def estimate_word_timings(transcription, total_duration):
    """Spread ``total_duration`` across the words of a transcription.

    The timing is an estimate, not an alignment: each word receives a share
    of the duration proportional to its character count, and words are laid
    out back to back starting at time zero.

    Args:
        transcription: Text to split on whitespace into words.
        total_duration: Length of the audio in seconds.

    Returns:
        A list of ``(word, start_time, end_time)`` tuples in spoken order.
        The list is empty when the transcription contains no words.
    """
    words = transcription.split()
    if not words:
        # Nothing was recognised (e.g. silent audio); without this guard the
        # per-character duration below would divide by zero.
        return []

    total_chars = sum(len(word) for word in words)
    char_duration = total_duration / total_chars

    word_timings = []
    current_time = 0
    for word in words:
        end_time = current_time + len(word) * char_duration
        word_timings.append((word, current_time, end_time))
        current_time = end_time

    return word_timings
33
+
34
# Checkpoint identifier passed to ``from_pretrained`` for both the processor
# and the CTC model, so the two always come from the same source.
model_name = "Akashpb13/xlsr_kurmanji_kurdish"

# Loaded once at import time and reused by every transcription request.
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
37
+
38
def _build_srt(word_timings):
    """Render ``(word, start, end)`` tuples as SRT text, one numbered cue per word."""
    return "".join(
        f"{index}\n{format_time(start)} --> {format_time(end)}\n{word}\n\n"
        for index, (word, start, end) in enumerate(word_timings, start=1)
    )


def transcribe_audio(file):
    """Transcribe an audio file and write a word-by-word SRT subtitle file.

    Args:
        file: Path of the audio file to transcribe, or ``None`` when the
            request carries no audio.

    Returns:
        A ``(transcription, srt_path)`` tuple. ``srt_path`` points to a file
        named ``output_word_by_word.srt`` inside a directory created for this
        call. When ``file`` is ``None`` the result is ``("", None)``.
    """
    import tempfile  # local import: only needed for the per-call output directory

    if file is None:
        # No audio supplied; return empty outputs rather than failing inside
        # the audio loader.
        return "", None

    # Resample to 16 kHz on load and tell the processor that same rate.
    speech, rate = librosa.load(file, sr=16000)
    input_values = processor(speech, return_tensors="pt", sampling_rate=rate).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: take the highest-scoring token id at every frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    total_duration = len(speech) / rate

    # Skip the timing estimate for an empty transcription so that silent
    # audio yields an empty SRT file instead of an error.
    if transcription.strip():
        word_timings = estimate_word_timings(transcription, total_duration)
    else:
        word_timings = []

    # One fresh directory per call keeps the download name stable while
    # preventing concurrent requests from overwriting each other's file.
    # NOTE(review): these directories are not cleaned up here.
    output_dir = tempfile.mkdtemp(prefix="srt_")
    output_filename = os.path.join(output_dir, "output_word_by_word.srt")
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(_build_srt(word_timings))

    return transcription, output_filename
61
+
62
# Web UI definition: one audio input (requested as a file path) wired to
# ``transcribe_audio``, whose two return values fill the text box and the
# downloadable file, in that order.
interface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.File(label="Download SRT File"),
    ],
    title="Deng --- Nivîsandin ::: Kurdî-Kurmancî",
    description="Dengê xwe ji me re rêke û li Submit bixe ... û bila bêhna te fireh be .",
    article="By Derax Elî",
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    interface.launch()