leetuan023 committed on
Commit
52dff6c
·
verified ·
1 Parent(s): 2b962b0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +235 -0
app.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import os
4
+ import shutil
5
+ import subprocess
6
+ import tempfile
7
+ import asyncio
8
+ import edge_tts
9
+ import pysrt
10
+ import logging
11
+ import random
12
+ import gradio as gr
13
+
14
+ # Logging setup
15
# Log line layout: timestamp, source file, function, line number, level, message.
FORMAT = "[%(asctime)s %(filename)s->%(funcName)s():%(lineno)s]%(levelname)s: %(message)s"
logging.basicConfig(format=FORMAT)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
18
+
19
+ # Function for dependency check (ffmpeg, ffprobe)
20
def dep_check():
    """Check that the external tools this app shells out to are on PATH.

    Raises:
        RuntimeError: if ``ffmpeg`` or ``ffprobe`` cannot be located.
    """
    required_tools = (
        ("ffmpeg", "ffmpeg is not installed"),
        ("ffprobe", "ffprobe (part of ffmpeg) is not installed"),
    )
    # Checked in order, so a missing ffmpeg is reported before a missing ffprobe.
    for executable, complaint in required_tools:
        if not shutil.which(executable):
            raise RuntimeError(complaint)
25
+
26
+ # Function to convert SRT time to seconds
27
def pysrttime_to_seconds(t):
    """Convert a time object with hours/minutes/seconds/milliseconds to seconds.

    Args:
        t: object exposing ``hours``, ``minutes``, ``seconds`` and
            ``milliseconds`` attributes.

    Returns:
        The total time in seconds, including the fractional millisecond part.
    """
    total_minutes = t.hours * 60 + t.minutes
    whole_seconds = total_minutes * 60 + t.seconds
    return whole_seconds + t.milliseconds / 1000
29
+
30
+ # Get the duration of an audio/video file
31
def get_duration(in_file):
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        in_file: path of the audio/video file to inspect.

    Returns:
        The container duration as a float.

    Raises:
        subprocess.CalledProcessError: if ffprobe exits with a non-zero status.
        ValueError: if ffprobe's output cannot be parsed as a float.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        # Print only the bare value, without a "duration=" key or wrappers.
        "-of", "default=noprint_wrappers=1:nokey=1",
        in_file,
    ]
    raw_output = subprocess.check_output(probe_cmd)
    return float(raw_output.decode("utf-8"))
42
+
43
+ # Ensure the audio file matches the specified length
44
def ensure_audio_length(in_file, out_file, length):
    """Write ``in_file`` to ``out_file``, sped up if needed to fit ``length``.

    When the input is longer than ``length`` seconds it is re-encoded through
    ffmpeg's ``atempo`` filter; otherwise it is copied unchanged (audio is
    never slowed down).

    Args:
        in_file: path of the source audio file.
        out_file: path the result is written to.
        length: target duration in seconds. A non-positive value (e.g. a
            subtitle whose start equals its end) requests the maximum speed-up
            instead of raising ZeroDivisionError.

    Raises:
        subprocess.CalledProcessError: if ffprobe or ffmpeg fails.
    """
    max_atempo = 100
    duration = get_duration(in_file)

    if length > 0:
        # Speed factor required to squeeze the clip into its slot, capped.
        atempo = min(duration / length, max_atempo)
    else:
        # No usable slot length: compress the clip as much as allowed.
        atempo = max_atempo

    if atempo > 1:
        retcode = subprocess.call(
            ["ffmpeg", "-y", "-i", in_file, "-filter:a", f"atempo={atempo}", out_file],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        if retcode != 0:
            raise subprocess.CalledProcessError(retcode, "ffmpeg")
    else:
        # Already short enough: keep the audio as generated.
        shutil.copyfile(in_file, out_file)
62
+
63
+ # Function to generate silence
64
def silence_gen(out_file, duration):
    """Create an audio file containing ``duration`` seconds of silence.

    Args:
        out_file: path of the file to create (overwritten if it exists).
        duration: length of the silence in seconds.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    ffmpeg_cmd = [
        "ffmpeg", "-y",
        # Synthetic null source: mono, 24 kHz.
        "-f", "lavfi", "-i", "anullsrc=cl=mono:r=24000",
        "-t", str(duration),
        out_file,
    ]
    exit_status = subprocess.call(
        ffmpeg_cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    if exit_status == 0:
        return
    raise subprocess.CalledProcessError(exit_status, "ffmpeg")
73
+
74
+ # Handle enhanced SRT parameters (rate, volume, voice)
75
def get_enhanced_srt_params(text, arg):
    """Apply a trailing ``edge_tts{...}`` directive line to the TTS arguments.

    If the last line of ``text`` has the form
    ``edge_tts{key:value,key:value}``, the listed overrides are written into
    ``arg`` and that line is removed from the text. Otherwise both are
    returned untouched.

    Args:
        text: subtitle text, possibly ending with a directive line.
        arg: dict of TTS arguments; updated in place when a directive is found.

    Returns:
        A ``(arg, text)`` tuple: the (possibly updated) argument dict and the
        text without the directive line.

    Raises:
        ValueError: if the directive is empty, a pair has no ``:`` separator,
            a key is not one of ``rate``/``volume``/``voice``, or a value is
            empty. ``arg`` is left unmodified in that case.
    """
    prefix = "edge_tts{"
    allowed_keys = ("rate", "volume", "voice")

    spoken_text, _, last_line = text.rpartition("\n")
    if not (last_line.startswith(prefix) and last_line.endswith("}")):
        return arg, text

    overrides = {}
    for pair in last_line[len(prefix):-1].split(","):
        # Split on the first colon only, and tolerate spaces around tokens
        # ("rate:+10%, voice:...").
        key, separator, value = pair.partition(":")
        key = key.strip()
        value = value.strip()
        if not separator or key not in allowed_keys or not value:
            raise ValueError("edge_tts{} is invalid")
        overrides[key] = value

    # Validate everything first so a bad directive never half-applies.
    arg.update(overrides)
    return arg, spoken_text
88
+
89
+ # Asynchronous audio generation
90
async def audio_gen(queue):
    """Synthesize one subtitle's speech and fit it to the subtitle's duration.

    Takes a single job dict from ``queue`` (keys read here: ``fname``,
    ``text``, ``duration``, ``enhanced_srt``, ``rate``, ``volume``,
    ``voice``), generates speech into ``fname`` with edge_tts, then either
    speeds the clip up to fit ``duration`` or, when no audio was produced,
    replaces it with silence of that duration.

    Args:
        queue: ``asyncio.Queue`` holding job dicts; exactly one is consumed.

    Raises:
        ValueError: if an ``edge_tts{...}`` directive in the text is invalid.
        Exception: if synthesis keeps failing after the retry budget is used.
        subprocess.CalledProcessError: if an ffmpeg/ffprobe step fails.
    """
    retry_count = 0
    retry_limit = 5
    arg = await queue.get()
    fname, text, duration, enhanced_srt = (
        arg["fname"],
        arg["text"],
        arg["duration"],
        arg["enhanced_srt"],
    )

    if enhanced_srt:
        arg, text = get_enhanced_srt_params(text, arg)
    # Speak multi-line subtitles as one sentence.
    text = " ".join(text.split("\n"))

    while True:
        try:
            communicate = edge_tts.Communicate(
                text, rate=arg["rate"], volume=arg["volume"], voice=arg["voice"]
            )
            await communicate.save(fname)
        except edge_tts.exceptions.NoAudioReceived:
            # Nothing to speak: leave an empty file so silence is substituted below.
            with open(fname, "wb") as fobj:
                fobj.write(b"")
        except Exception as e:
            # Gives up once retry_count has exceeded retry_limit, i.e. after
            # retry_limit + 1 retries.
            if retry_count > retry_limit:
                raise Exception(f"Too many retries for {fname}") from e
            retry_count += 1
            # Back off a little longer each time, with jitter.
            await asyncio.sleep(retry_count + random.randint(1, 5))
            continue
        break

    if os.path.getsize(fname) > 0:
        # Reserve a scratch path and close the handle before ffmpeg writes to it.
        scratch = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
        scratch.close()
        try:
            ensure_audio_length(fname, scratch.name, duration)
            # Replace the clip only after the fitted version was written
            # successfully; a failed run must not clobber the original.
            shutil.move(scratch.name, fname)
        finally:
            # Still present only if fitting failed; do not leak it.
            if os.path.exists(scratch.name):
                os.remove(scratch.name)
    else:
        silence_gen(fname, duration)

    queue.task_done()
127
+
128
+ # Main async processing logic
129
+ async def _main(srt_data, voice, rate, volume, batch_size, enhanced_srt):
130
+ max_duration = pysrttime_to_seconds(srt_data[-1].end)
131
+ input_files = []
132
+ input_files_start_end = {}
133
+
134
+ with tempfile.TemporaryDirectory() as temp_dir:
135
+ args = []
136
+ queue = asyncio.Queue()
137
+ for i, j in enumerate(srt_data):
138
+ fname = os.path.join(temp_dir, f"{i}.mp3")
139
+ input_files.append(fname)
140
+ start = pysrttime_to_seconds(j.start)
141
+ end = pysrttime_to_seconds(j.end)
142
+ input_files_start_end[fname] = (start, end)
143
+ duration = pysrttime_to_seconds(j.duration)
144
+ args.append(
145
+ {
146
+ "fname": fname,
147
+ "text": j.text,
148
+ "rate": rate,
149
+ "volume": volume,
150
+ "voice": voice,
151
+ "duration": duration,
152
+ "enhanced_srt": enhanced_srt,
153
+ }
154
+ )
155
+
156
+ args_len = len(args)
157
+ for i in range(0, args_len, batch_size):
158
+ tasks = []
159
+ for j in range(i, min(i + batch_size, args_len)):
160
+ tasks.append(audio_gen(queue))
161
+ await queue.put(args[j])
162
+ for f in asyncio.as_completed(tasks):
163
+ await f
164
+
165
+ output_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
166
+ f = tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", delete=False)
167
+ try:
168
+ last_end = 0
169
+ for i, j in enumerate(input_files):
170
+ start = input_files_start_end[j][0]
171
+ needed = start - last_end
172
+ if needed > 0.0001:
173
+ sfname = os.path.join(temp_dir, f"silence_{i}.mp3")
174
+ silence_gen(sfname, needed)
175
+ f.write(f"file '{sfname}'\n")
176
+ last_end += get_duration(sfname)
177
+ f.write(f"file '{j}'\n")
178
+ last_end += get_duration(j)
179
+
180
+ f.flush()
181
+ f.close()
182
+
183
+ retcode = subprocess.call(
184
+ [
185
+ "ffmpeg",
186
+ "-y", "-f", "concat", "-safe", "0", "-i", f.name, "-c", "copy", output_file
187
+ ],
188
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
189
+ )
190
+ if retcode != 0:
191
+ raise subprocess.CalledProcessError(retcode, "ffmpeg")
192
+ finally:
193
+ f.close()
194
+ os.remove(f.name)
195
+
196
+ return output_file
197
+
198
+ # Gradio Interface
199
def process_srt_to_mp3(srt_file, voice, speed, volume, batch_size, enhanced_srt):
    """Gradio callback: convert an uploaded SRT file into a single MP3.

    Args:
        srt_file: the uploaded subtitle file, either a filesystem path
            (``str`` / ``os.PathLike``) or a file-like object with ``read()``.
        voice: voice name for the TTS engine.
        speed: speech-rate string (e.g. ``"+0%"``).
        volume: volume string (e.g. ``"+0%"``).
        batch_size: number of subtitles synthesized concurrently.
        enhanced_srt: whether ``edge_tts{...}`` directives are honoured.

    Returns:
        Path of the generated MP3 file.

    Raises:
        ValueError: if no file was provided.
    """
    if srt_file is None:
        raise ValueError("No SRT file provided")

    # The upload may arrive as a path or as an open file object, depending on
    # how the File component is configured.
    if isinstance(srt_file, (str, os.PathLike)):
        with open(srt_file, "rb") as fobj:
            raw = fobj.read()
    else:
        raw = srt_file.read()

    # "utf-8-sig" decodes plain UTF-8 identically and drops a leading BOM.
    srt_text = raw.decode("utf-8-sig") if isinstance(raw, bytes) else raw

    srt_data = pysrt.from_string(srt_text)
    return asyncio.run(
        _main(
            srt_data=srt_data,
            voice=voice,
            rate=speed,
            volume=volume,
            batch_size=batch_size,
            enhanced_srt=enhanced_srt,
        )
    )
212
+
213
+ # Gradio UI elements
214
def create_ui():
    """Build the Gradio interface that wraps ``process_srt_to_mp3``.

    Returns:
        The configured ``gr.Interface`` (not yet launched).
    """
    voice_options = ["en-US-AriaNeural", "en-US-JennyNeural"]

    # Order must match the positional parameters of process_srt_to_mp3.
    input_widgets = [
        gr.File(label="Upload SRT File"),
        gr.Dropdown(voice_options, label="Voice", value="en-US-AriaNeural"),
        gr.Textbox(value="+0%", label="Speech Rate (default +0%)"),
        gr.Textbox(value="+0%", label="Volume (default +0%)"),
        gr.Slider(1, 100, value=50, label="Batch Size"),
        gr.Checkbox(value=True, label="Enable Enhanced SRT"),
    ]
    result_widget = gr.File(label="Generated MP3 File")

    return gr.Interface(
        fn=process_srt_to_mp3,
        inputs=input_widgets,
        outputs=result_widget,
        title="SRT to MP3 Converter",
        description="Converts SRT files to MP3 using Edge TTS and FFmpeg",
    )
231
+
232
+ # Launch Gradio interface
233
if __name__ == "__main__":
    # Fail fast if ffmpeg/ffprobe are missing, before starting the web UI.
    dep_check()
    app = create_ui()
    app.launch()