hivecorp committed on
Commit
8154937
·
verified ·
1 Parent(s): 937cab5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +187 -0
app.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from pydub import AudioSegment
3
+ import edge_tts
4
+ import os
5
+ import asyncio
6
+ import uuid
7
+ import re
8
+
9
+ # Function to get the length of an audio file in seconds
10
def get_audio_length(audio_file):
    """Return the duration of ``audio_file`` in seconds.

    The file is decoded with ``AudioSegment.from_file``; the segment's
    length (presumably milliseconds, per pydub's convention) is divided by
    1000 so callers receive seconds.

    Args:
        audio_file: Path (or file-like object) accepted by
            ``AudioSegment.from_file``.

    Returns:
        The duration as a float number of seconds.
    """
    duration_ms = len(AudioSegment.from_file(audio_file))
    return duration_ms / 1000
13
+
14
+ # Function to format time for SRT in milliseconds
15
def format_time_ms(milliseconds):
    """Format a millisecond offset as an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        milliseconds: Offset from the start of the audio in milliseconds.
            Both ints and floats are accepted.

    Returns:
        The zero-padded timestamp string, e.g. ``"01:02:03,004"``.
    """
    # Round instead of truncating: float arithmetic on timestamps can yield
    # values such as 1099.9999999, which truncation would turn into 1099 ms.
    # Negative offsets are clamped to zero; SRT has no negative timestamps
    # and divmod on a negative value would produce a malformed string.
    total_ms = max(0, round(milliseconds))
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hrs, mins = divmod(total_minutes, 60)
    return f"{hrs:02}:{mins:02}:{secs:02},{ms:03}"
20
+
21
+ # Function to split text into segments based on punctuation, ensuring no word is split
22
def split_text_into_segments(text, max_words=8):
    """Split ``text`` into subtitle-sized segments without breaking words.

    The text is first cut after each run of ``.``, ``!``, ``?`` or ``,``;
    the punctuation stays attached to the last word before it. Any piece
    longer than ``max_words`` words is further divided into consecutive
    chunks of at most ``max_words`` words.

    Args:
        text: The script text to split.
        max_words: Maximum number of words per segment (default 8).

    Returns:
        A list of non-empty segment strings, in reading order. Whitespace
        inside a segment is collapsed to single spaces so a segment never
        contains a line break (a blank line would terminate an SRT cue).

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    # With one capture group, re.split returns text and punctuation pieces
    # alternately and always ends on a (possibly empty) text piece. Matching
    # a *run* of punctuation keeps "..." or "?!" together instead of
    # emitting punctuation-only segments.
    pieces = re.split(r'([.!?,]+)', text)

    segments = []
    for position in range(0, len(pieces), 2):
        words = pieces[position].split()
        if not words:
            # Empty text or punctuation with nothing in front of it.
            continue

        if position + 1 < len(pieces):
            words[-1] += pieces[position + 1]

        # The trailing, unpunctuated piece goes through the same chunking so
        # it also respects the word limit.
        segments.extend(
            " ".join(words[start:start + max_words])
            for start in range(0, len(words), max_words)
        )

    return segments
49
+
50
+ # Function to generate SRT with millisecond accuracy per batch
51
async def generate_accurate_srt(batch_text, batch_num, start_offset, pitch, rate, voice):
    """Synthesize one batch of text and build its SRT cues.

    The batch is rendered to a temporary audio file with edge-tts, the
    file's real duration is measured, and that duration is shared out
    between the text segments of the batch.

    Args:
        batch_text: Text of this batch.
        batch_num: Zero-based batch index; used in the temp file name and
            as an offset for the cue numbers.
        start_offset: Position of this batch in the final audio, in
            milliseconds.
        pitch: Pitch string passed through to ``edge_tts.Communicate``.
        rate: Rate string passed through to ``edge_tts.Communicate``.
        voice: Voice name passed through to ``edge_tts.Communicate``.

    Returns:
        A tuple ``(srt_content, audio_file, end_offset)``: the SRT text for
        this batch, the path of the audio file that was written (the caller
        is responsible for deleting it), and the offset in milliseconds at
        which this batch's audio ends.
    """
    # A random component keeps concurrent requests from overwriting each
    # other's batch files. NOTE(review): edge-tts is understood to emit MP3
    # data, hence the extension - confirm against the installed version.
    audio_file = f"batch_{batch_num}_{uuid.uuid4().hex}_audio.mp3"

    tts = edge_tts.Communicate(batch_text, voice, rate=rate, pitch=pitch)
    await tts.save(audio_file)

    actual_length = get_audio_length(audio_file) * 1000  # Convert to milliseconds
    batch_end = start_offset + actual_length

    segments = split_text_into_segments(batch_text)
    if not segments:
        # Nothing to caption (e.g. a whitespace-only batch); avoid dividing
        # by zero but still account for the audio that was produced.
        return "", audio_file, batch_end

    # Give each cue a share of the batch duration proportional to its text
    # length, so a one-word cue is not shown as long as an eight-word one.
    total_chars = sum(len(segment) for segment in segments)

    cue_parts = []
    start_time = start_offset
    chars_so_far = 0
    for index, segment in enumerate(segments):
        chars_so_far += len(segment)
        # Computed from the cumulative share, so the last cue ends exactly at
        # batch_end and rounding errors cannot accumulate.
        end_time = start_offset + actual_length * chars_so_far / total_chars

        # NOTE: cue numbers are offset by batch_num * 100, so they are unique
        # only while a batch contains at most 100 segments.
        cue_parts.append(f"{index + 1 + (batch_num * 100)}\n")
        cue_parts.append(f"{format_time_ms(start_time)} --> {format_time_ms(end_time)}\n")
        cue_parts.append(segment + "\n\n")

        start_time = end_time

    return "".join(cue_parts), audio_file, batch_end
77
+
78
+ # Batch processing function with millisecond accuracy
79
# Matches one SRT timing line such as "00:00:01,000 --> 00:00:02,500".
_SRT_TIMING_LINE = re.compile(r'(\d+:\d{2}:\d{2},\d{3}) --> (\d+:\d{2}:\d{2},\d{3})')


def _split_into_batches(text, limit=500):
    """Split ``text`` into batches of at most ``limit`` characters without cutting words."""
    if limit < 1:
        raise ValueError("limit must be at least 1")

    batches = []
    remaining = text.strip()
    while len(remaining) > limit:
        window = remaining[:limit]
        sentence_ends = [match.end() for match in re.finditer(r'[.!?]\s', window)]
        if sentence_ends:
            # Prefer to break after the last complete sentence in the window.
            cut = sentence_ends[-1]
        else:
            last_space = max(
                (match.start() for match in re.finditer(r'\s', window)),
                default=-1,
            )
            # Fall back to a hard cut only for a single word longer than limit.
            cut = last_space + 1 if last_space > 0 else limit
        batches.append(remaining[:cut].strip())
        remaining = remaining[cut:].lstrip()
    if remaining:
        batches.append(remaining)
    return batches


def _timestamp_to_ms(stamp):
    """Convert an ``HH:MM:SS,mmm`` timestamp to integer milliseconds."""
    clock, _, millis = stamp.partition(",")
    hours, minutes, seconds = (int(part) for part in clock.split(":"))
    return ((hours * 60 + minutes) * 60 + seconds) * 1000 + int(millis)


def _finalize_srt(srt_text, total_ms):
    """Clamp every cue to ``total_ms`` and renumber the cues 1, 2, 3, ..."""
    lines = []
    cue_number = 0
    for line in srt_text.strip().splitlines():
        timing = _SRT_TIMING_LINE.fullmatch(line.strip())
        if timing:
            # Integer arithmetic avoids the 1 ms loss a float round trip
            # can introduce; start and end are both kept inside the audio.
            start_ms = min(_timestamp_to_ms(timing.group(1)), total_ms)
            end_ms = min(_timestamp_to_ms(timing.group(2)), total_ms)
            line = f"{format_time_ms(start_ms)} --> {format_time_ms(end_ms)}"
            # The line right before a timing line is that cue's number.
            if lines and lines[-1].strip().isdigit():
                cue_number += 1
                lines[-1] = str(cue_number)
        lines.append(line)
    return "".join(f"{line}\n" for line in lines)


async def batch_process_srt_and_audio(script_text, pitch, rate, voice, progress=gr.Progress()):
    """Convert a script to one MP3 file plus a matching SRT subtitle file.

    The script is divided into batches of at most 500 characters, each
    batch is synthesized via ``generate_accurate_srt``, the batch audio is
    concatenated, and the collected cues are clamped to the final audio
    length and renumbered before being written out.

    Args:
        script_text: Full script entered by the user.
        pitch: Pitch string forwarded to ``generate_accurate_srt``.
        rate: Rate string forwarded to ``generate_accurate_srt``.
        voice: Voice name forwarded to ``generate_accurate_srt``.
        progress: Gradio progress tracker, updated after every batch.

    Returns:
        A tuple ``(final_srt_path, final_audio_path)`` naming the files
        written to the current working directory.

    Raises:
        gr.Error: If the script contains no text.
    """
    batches = _split_into_batches(script_text or "")
    if not batches:
        raise gr.Error("Please enter some script text to convert.")

    srt_parts = []
    combined_audio = AudioSegment.empty()
    start_offset = 0.0

    for batch_num, batch_text in enumerate(batches):
        srt_content, audio_file, _ = await generate_accurate_srt(
            batch_text, batch_num, start_offset, pitch, rate, voice
        )
        try:
            combined_audio += AudioSegment.from_file(audio_file)
        finally:
            # Remove the temporary batch file even when decoding fails.
            if os.path.exists(audio_file):
                os.remove(audio_file)

        srt_parts.append(srt_content)
        # Take the next offset from the audio actually assembled so the
        # subtitles cannot drift away from the exported file.
        start_offset = float(len(combined_audio))
        progress((batch_num + 1) / len(batches))

    validated_srt_content = _finalize_srt("".join(srt_parts), len(combined_audio))

    unique_id = uuid.uuid4()
    final_audio_path = f"final_audio_{unique_id}.mp3"
    final_srt_path = f"final_subtitles_{unique_id}.srt"

    combined_audio.export(final_audio_path, format="mp3", bitrate="320k")

    # Explicit UTF-8 so non-ASCII script text does not depend on the
    # platform's default encoding.
    with open(final_srt_path, "w", encoding="utf-8") as srt_file:
        srt_file.write(validated_srt_content)

    return final_srt_path, final_audio_path
118
+
119
+ # Gradio interface function
120
async def process_script(script_text, pitch, rate, voice):
    """Gradio callback: turn the UI inputs into subtitle and audio files.

    Args:
        script_text: Script entered in the text box.
        pitch: Pitch adjustment in Hz from the slider.
        rate: Rate adjustment in percent from the slider.
        voice: Display label of the selected voice; must be a key of
            ``voice_options``.

    Returns:
        A tuple ``(srt_path, audio_path, audio_path)``; the audio path is
        returned twice because it feeds both the download and the playback
        outputs.

    Raises:
        KeyError: If ``voice`` is not a key of ``voice_options``.
    """
    # edge-tts expects explicitly signed relative values ("+5Hz", "-10%",
    # "+0%"). The "+d" format always emits the sign, including for 0 and 1,
    # so no sentinel such as "-1Hz" is needed for the neutral setting.
    pitch_str = f"{int(pitch):+d}Hz"
    formatted_rate = f"{int(rate):+d}%"
    srt_path, audio_path = await batch_process_srt_and_audio(
        script_text, pitch_str, formatted_rate, voice_options[voice]
    )
    return srt_path, audio_path, audio_path
125
+
126
+ # Gradio interface setup
127
# Maps the label shown in the voice dropdown to the edge-tts voice name.
# Labels must not carry stray whitespace: they are used verbatim as
# dropdown choices and as lookup keys.
voice_options = {
    "Andrew Male": "en-US-AndrewNeural",
    "Jenny Female": "en-US-JennyNeural",
    "Guy Male": "en-US-GuyNeural",
    "Ana Female": "en-US-AnaNeural",
    "Aria Female": "en-US-AriaNeural",
    "Brian Male": "en-US-BrianNeural",
    "Christopher Male": "en-US-ChristopherNeural",
    "Eric Male": "en-US-EricNeural",
    "Michelle Female": "en-US-MichelleNeural",
    "Roger Male": "en-US-RogerNeural",
    "Natasha Female": "en-AU-NatashaNeural",
    "William Male": "en-AU-WilliamNeural",
    "Clara Female": "en-CA-ClaraNeural",
    "Liam Male": "en-CA-LiamNeural",
    "Libby Female": "en-GB-LibbyNeural",
    "Maisie": "en-GB-MaisieNeural",
    "Ryan": "en-GB-RyanNeural",
    "Sonia": "en-GB-SoniaNeural",
    "Thomas": "en-GB-ThomasNeural",
    "Sam": "en-HK-SamNeural",
    "Yan": "en-HK-YanNeural",
    "Connor": "en-IE-ConnorNeural",
    "Emily": "en-IE-EmilyNeural",
    "Neerja": "en-IN-NeerjaNeural",
    "Prabhat": "en-IN-PrabhatNeural",
    "Asilia": "en-KE-AsiliaNeural",
    "Chilemba": "en-KE-ChilembaNeural",
    "Abeo": "en-NG-AbeoNeural",
    "Ezinne": "en-NG-EzinneNeural",
    "Mitchell": "en-NZ-MitchellNeural",
    "James": "en-PH-JamesNeural",
    "Rosa": "en-PH-RosaNeural",
    "Luna": "en-SG-LunaNeural",
    "Wayne": "en-SG-WayneNeural",
    "Elimu": "en-TZ-ElimuNeural",
    "Imani": "en-TZ-ImaniNeural",
    "Leah": "en-ZA-LeahNeural",
    "Luke": "en-ZA-LukeNeural",
    # Add other voices here...
}
168
+
169
# Input widgets, in the order process_script receives them:
# script text, pitch (Hz), rate (%), voice label.
input_components = [
    gr.Textbox(label="Enter Script Text", lines=10),
    gr.Slider(label="Pitch Adjustment (Hz)", minimum=-20, maximum=20, value=0, step=1),
    gr.Slider(label="Rate Adjustment (%)", minimum=-50, maximum=50, value=-1, step=1),
    gr.Dropdown(label="Select Voice", choices=list(voice_options.keys()), value="Andrew Male"),
]

# Output widgets, matching the three values process_script returns:
# SRT file, audio file, and the same audio for in-page playback.
output_components = [
    gr.File(label="Download SRT File"),
    gr.File(label="Download Audio File"),
    gr.Audio(label="Audio Playback"),
]

app = gr.Interface(
    fn=process_script,
    inputs=input_components,
    outputs=output_components,
    title="WritooAI Pro Text-to-Speech with Subtitle",
    description="Convert your script into Audio with Auto generated Subtitles.",
    theme="compact",
)

# Start the web server as soon as the module is executed.
app.launch()