|
import gradio as gr |
|
from pydub import AudioSegment |
|
import edge_tts |
|
import os |
|
import asyncio |
|
import uuid |
|
import re |
|
|
|
|
|
def get_audio_length(audio_file):
    """Return the duration of ``audio_file`` in seconds.

    The file is decoded with pydub's ``AudioSegment.from_file``; the length
    of the resulting segment (pydub measures it in milliseconds) is then
    converted to seconds.
    """
    duration_ms = len(AudioSegment.from_file(audio_file))
    return duration_ms / 1000
|
|
|
|
|
def format_time_ms(milliseconds):
    """Format a millisecond count as an SRT timestamp ``HH:MM:SS,mmm``.

    Fractional milliseconds are discarded by ``int()`` before formatting.
    """
    total_ms = int(milliseconds)
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    whole_seconds, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{millis:03}"
|
|
|
|
|
def split_text_into_segments(text, max_words=8):
    """Split script text into short, subtitle-sized segments.

    The text is first cut after each ``.``, ``!``, ``?`` or ``,``; every
    resulting piece is then broken into chunks of at most ``max_words``
    words. This includes any trailing text that has no closing punctuation,
    so an unpunctuated script still yields short cues.

    Whitespace inside a segment (including newlines) is collapsed to single
    spaces, because a blank line inside a cue would terminate it in an SRT
    file. Runs of punctuation such as ``...`` or ``?!`` are attached to the
    preceding segment instead of becoming punctuation-only segments.

    Args:
        text: The script text to split.
        max_words: Maximum number of words per segment (default 8).

    Returns:
        A list of non-empty segment strings, in reading order.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    segments = []
    # With a capturing group, re.split returns text and delimiters
    # alternately: [text, delim, text, delim, ..., text].
    pieces = re.split(r'([.!?,])', text)

    for position in range(0, len(pieces), 2):
        body = pieces[position].strip()
        delimiter = pieces[position + 1] if position + 1 < len(pieces) else ""

        if not body:
            # Nothing spoken between two delimiters (e.g. the middle of
            # "..."): keep the punctuation, but glue it to the previous cue.
            if delimiter:
                if segments:
                    segments[-1] += delimiter
                else:
                    # Leading punctuation has no previous cue to join.
                    segments.append(delimiter)
            continue

        words = (body + delimiter).split()
        segments.extend(
            " ".join(words[start:start + max_words])
            for start in range(0, len(words), max_words)
        )

    return segments
|
|
|
|
|
async def generate_accurate_srt(batch_text, batch_num, start_offset, pitch, rate, voice):
    """Synthesize one batch of text and build the SRT cues for it.

    The batch is spoken with edge-tts and saved to a temporary audio file.
    Its text is split with ``split_text_into_segments`` and the measured
    audio duration is divided evenly between the segments. The timing is
    therefore an approximation: long and short segments get the same share.

    Args:
        batch_text: Text to synthesize for this batch.
        batch_num: Zero-based batch number; cue numbers for this batch start
            at ``batch_num * 100 + 1``, so they are not globally sequential.
        start_offset: Position of this batch on the overall timeline, in
            milliseconds.
        pitch: Pitch string passed to ``edge_tts.Communicate``.
        rate: Rate string passed to ``edge_tts.Communicate``.
        voice: Voice name passed to ``edge_tts.Communicate``.

    Returns:
        A tuple ``(srt_content, audio_file, end_offset)``: the SRT text for
        this batch, the path of the generated audio file (the caller is
        responsible for deleting it), and the timeline position in
        milliseconds at which this batch's audio ends.
    """
    # A unique name keeps concurrent requests from overwriting each other's
    # batch files; edge-tts produces MP3 data, so name the file accordingly.
    audio_file = f"batch_{batch_num}_{uuid.uuid4().hex}_audio.mp3"

    tts = edge_tts.Communicate(batch_text, voice, rate=rate, pitch=pitch)
    try:
        await tts.save(audio_file)
        actual_length = get_audio_length(audio_file) * 1000
    except BaseException:
        # The caller never learns the path on failure (or cancellation), so
        # remove any partial file here before propagating the error.
        if os.path.exists(audio_file):
            os.remove(audio_file)
        raise

    batch_end = start_offset + actual_length

    segments = split_text_into_segments(batch_text)
    if not segments:
        # Nothing to caption: still report the audio so the timeline
        # advances, instead of dividing by zero below.
        return "", audio_file, batch_end

    segment_duration = actual_length / len(segments)
    last_index = len(segments) - 1

    cues = []
    start_time = start_offset
    for index, segment in enumerate(segments):
        if index == last_index:
            # Pin the final cue to the real end of the audio.
            end_time = batch_end
        else:
            # Multiply rather than accumulate to avoid float drift.
            end_time = min(start_offset + segment_duration * (index + 1), batch_end)

        cues.append(
            f"{index + 1 + (batch_num * 100)}\n"
            f"{format_time_ms(start_time)} --> {format_time_ms(end_time)}\n"
            f"{segment}\n\n"
        )
        start_time = end_time

    return "".join(cues), audio_file, batch_end
|
|
|
|
|
# Matches a complete SRT timing line such as "00:00:01,000 --> 00:00:02,500".
_SRT_TIMING_LINE = re.compile(r"(\d+:\d{2}:\d{2},\d{3}) --> (\d+:\d{2}:\d{2},\d{3})")


def _split_into_batches(script_text, limit=500):
    """Split text into pieces of at most ``limit`` characters, cutting at whitespace."""
    batches = []
    remaining = (script_text or "").strip()
    while len(remaining) > limit:
        # Look one character past the limit so a space exactly at the
        # boundary still counts as a valid cut point.
        window = remaining[:limit + 1]
        cut = max((i for i, ch in enumerate(window) if ch.isspace()), default=-1)
        if cut <= 0:
            # A single unbroken run longer than the limit: hard cut.
            cut = limit
        batches.append(remaining[:cut].rstrip())
        remaining = remaining[cut:].lstrip()
    if remaining:
        batches.append(remaining)
    return batches


def _srt_timestamp_to_ms(timestamp):
    """Convert an ``HH:MM:SS,mmm`` timestamp to integer milliseconds."""
    clock, millis = timestamp.split(",")
    hours, minutes, seconds = (int(part) for part in clock.split(":"))
    return ((hours * 60 + minutes) * 60 + seconds) * 1000 + int(millis)


def _finalize_srt(srt_text, total_ms):
    """Clamp cue times to ``total_ms`` and renumber cues sequentially from 1."""
    lines = srt_text.strip().splitlines()
    fixed_lines = []
    cue_number = 0
    for position, line in enumerate(lines):
        timing = _SRT_TIMING_LINE.fullmatch(line)
        if timing:
            start_ms = min(_srt_timestamp_to_ms(timing.group(1)), total_ms)
            end_ms = min(_srt_timestamp_to_ms(timing.group(2)), total_ms)
            line = f"{format_time_ms(start_ms)} --> {format_time_ms(end_ms)}"
        elif (
            line.strip().isdigit()
            and position + 1 < len(lines)
            and _SRT_TIMING_LINE.fullmatch(lines[position + 1])
        ):
            # A number directly above a timing line is a cue index.
            cue_number += 1
            line = str(cue_number)
        fixed_lines.append(line)
    return "".join(f"{line}\n" for line in fixed_lines)


async def batch_process_srt_and_audio(script_text, pitch, rate, voice, progress=gr.Progress()):
    """Convert a whole script into one MP3 file and a matching SRT file.

    The script is split into batches of at most 500 characters, cut at
    whitespace so no word is broken in half. Each batch is synthesized with
    ``generate_accurate_srt``; the audio pieces are concatenated and the cue
    text is collected. Afterwards every cue time is clamped to the length of
    the combined audio and the cues are renumbered 1, 2, 3, ...

    Args:
        script_text: The full script to convert.
        pitch: Pitch string forwarded to ``generate_accurate_srt``.
        rate: Rate string forwarded to ``generate_accurate_srt``.
        voice: Voice name forwarded to ``generate_accurate_srt``.
        progress: Gradio progress tracker, updated after every batch.

    Returns:
        A tuple ``(final_srt_path, final_audio_path)`` naming the files
        written to the current working directory.

    Raises:
        gr.Error: If the script contains no text to synthesize.
    """
    batches = _split_into_batches(script_text)
    if not batches:
        raise gr.Error("Please enter some script text to convert.")

    srt_parts = []
    combined_audio = AudioSegment.empty()
    start_offset = 0.0

    for batch_num, batch_text in enumerate(batches):
        srt_content, audio_file, end_offset = await generate_accurate_srt(
            batch_text, batch_num, start_offset, pitch, rate, voice
        )
        srt_parts.append(srt_content)

        try:
            combined_audio += AudioSegment.from_file(audio_file)
        finally:
            # Remove the per-batch file even when decoding fails.
            os.remove(audio_file)

        start_offset = end_offset
        progress((batch_num + 1) / len(batches))

    # len() of an AudioSegment is its duration in whole milliseconds, which
    # avoids the rounding errors of going through float seconds.
    validated_srt_content = _finalize_srt("".join(srt_parts), len(combined_audio))

    unique_id = uuid.uuid4()
    final_audio_path = f"final_audio_{unique_id}.mp3"
    final_srt_path = f"final_subtitles_{unique_id}.srt"

    combined_audio.export(final_audio_path, format="mp3", bitrate="320k")

    # Explicit UTF-8 so non-ASCII subtitle text is written correctly
    # regardless of the platform's default encoding.
    with open(final_srt_path, "w", encoding="utf-8") as srt_file:
        srt_file.write(validated_srt_content)

    return final_srt_path, final_audio_path
|
|
|
|
|
async def process_script(script_text, pitch, rate, voice):
    """Gradio handler: turn the UI inputs into subtitle and audio files.

    edge-tts only accepts pitch and rate strings that carry an explicit
    sign (for example ``+5Hz``, ``-10%``, ``+0%``), so both values are
    formatted with a forced sign. A pitch or rate of 0 is sent as a true
    zero adjustment.

    Args:
        script_text: The script entered by the user.
        pitch: Pitch adjustment in Hz, as a number.
        rate: Rate adjustment in percent, as a number.
        voice: Display name of the voice; must be a key of ``voice_options``.

    Returns:
        A tuple ``(srt_path, audio_path, audio_path)``; the audio path is
        returned twice, once for the download widget and once for playback.
    """
    # int() guards against float slider values ("5.0Hz" would be rejected).
    pitch_str = f"{int(pitch):+d}Hz"
    formatted_rate = f"{int(rate):+d}%"

    srt_path, audio_path = await batch_process_srt_and_audio(
        script_text, pitch_str, formatted_rate, voice_options[voice]
    )
    return srt_path, audio_path, audio_path
|
|
|
|
|
# Maps the label shown in the voice dropdown to the edge-tts voice name.
voice_options = {
    "Andrew Male": "en-US-AndrewNeural",
    "Jenny Female": "en-US-JennyNeural",
    "Guy Male": "en-US-GuyNeural",
    "Ana Female": "en-US-AnaNeural",
    "Aria Female": "en-US-AriaNeural",
    "Brian Male": "en-US-BrianNeural",
    "Christopher Male": "en-US-ChristopherNeural",
    "Eric Male": "en-US-EricNeural",
    # Was labelled "Michelle Male"; this is a female voice.
    "Michelle Female": "en-US-MichelleNeural",
    "Roger Male": "en-US-RogerNeural",
    "Natasha Female": "en-AU-NatashaNeural",
    "William Male": "en-AU-WilliamNeural",
    "Clara Female": "en-CA-ClaraNeural",
    # Was labelled "Liam Female " (wrong gender, stray trailing space).
    "Liam Male": "en-CA-LiamNeural",
    "Libby Female": "en-GB-LibbyNeural",
    "Maisie": "en-GB-MaisieNeural",
    "Ryan": "en-GB-RyanNeural",
    "Sonia": "en-GB-SoniaNeural",
    "Thomas": "en-GB-ThomasNeural",
    "Sam": "en-HK-SamNeural",
    "Yan": "en-HK-YanNeural",
    "Connor": "en-IE-ConnorNeural",
    "Emily": "en-IE-EmilyNeural",
    "Neerja": "en-IN-NeerjaNeural",
    "Prabhat": "en-IN-PrabhatNeural",
    "Asilia": "en-KE-AsiliaNeural",
    "Chilemba": "en-KE-ChilembaNeural",
    "Abeo": "en-NG-AbeoNeural",
    "Ezinne": "en-NG-EzinneNeural",
    "Mitchell": "en-NZ-MitchellNeural",
    "James": "en-PH-JamesNeural",
    "Rosa": "en-PH-RosaNeural",
    "Luna": "en-SG-LunaNeural",
    "Wayne": "en-SG-WayneNeural",
    "Elimu": "en-TZ-ElimuNeural",
    "Imani": "en-TZ-ImaniNeural",
    "Leah": "en-ZA-LeahNeural",
    "Luke": "en-ZA-LukeNeural",
}
|
|
|
# Build the web UI. The input widgets are listed in the same order as the
# parameters of process_script (script text, pitch, rate, voice), and the
# output widgets in the order of its three return values.
app = gr.Interface(
    process_script,
    [
        gr.Textbox(label="Enter Script Text", lines=10),
        gr.Slider(label="Pitch Adjustment (Hz)", minimum=-20, maximum=20, value=0, step=1),
        gr.Slider(label="Rate Adjustment (%)", minimum=-50, maximum=50, value=-1, step=1),
        gr.Dropdown(label="Select Voice", choices=list(voice_options), value="Andrew Male"),
    ],
    [
        gr.File(label="Download SRT File"),
        gr.File(label="Download Audio File"),
        gr.Audio(label="Audio Playback"),
    ],
    title="WritooAI Pro Text-to-Speech with Subtitle",
    description="Convert your script into Audio with Auto generated Subtitles.",
    theme="compact",
)

# Start the Gradio server as soon as the module is executed.
app.launch()
|
|