hivecorp committed on
Commit
18dfe86
·
verified ·
1 Parent(s): 7ad2a01

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -59
app.py CHANGED
@@ -2,13 +2,15 @@ import gradio as gr
2
  import edge_tts
3
  import asyncio
4
  import tempfile
 
5
  from moviepy.editor import AudioFileClip
6
- import re
7
 
 
8
  async def get_voices():
9
  voices = await edge_tts.list_voices()
10
  return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
11
 
 
12
  async def text_to_speech(text, voice, rate, pitch):
13
  if not text.strip():
14
  return None, gr.Warning("Please enter the text to convert.")
@@ -19,83 +21,104 @@ async def text_to_speech(text, voice, rate, pitch):
19
  rate_str = f"{rate:+d}%"
20
  pitch_str = f"{pitch:+d}Hz"
21
  communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
22
-
23
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
24
  tmp_path = tmp_file.name
25
  await communicate.save(tmp_path)
26
-
27
  return tmp_path, None
28
 
29
def _srt_timestamp(seconds):
    """Render an offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Round once, on whole milliseconds, so that float noise cannot shave a
    # millisecond off (2.3 s -> ",300", not ",299") and so that a value such as
    # 59.9996 s carries into the next second instead of printing ",1000".
    total_millis = max(0, round(seconds * 1000))
    total_seconds, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


def generate_srt(text, audio_duration, max_words_per_line):
    """Build SRT subtitle text by spreading ``text`` evenly over the audio.

    The text is split on whitespace and grouped into cues of at most
    ``max_words_per_line`` words. No per-word timing is available, so each
    cue's start and end are placed in proportion to the number of words that
    precede it and that it contains.

    Args:
        text: The spoken text. Runs of whitespace are treated as one separator.
        audio_duration: Length of the audio in seconds.
        max_words_per_line: Maximum words per cue; values below 1 are treated
            as 1, and float values (e.g. from a UI slider) are truncated.

    Returns:
        The SRT document as a string ("" when ``text`` has no words). Every
        cue is terminated by a blank line, as the SubRip format expects.
    """
    words = text.split()
    total_words = len(words)
    words_per_cue = max(1, int(max_words_per_line))

    cues = []
    for offset in range(0, total_words, words_per_cue):
        chunk = words[offset:offset + words_per_cue]
        # Timing follows the word position: the cue starts after `offset`
        # words have been spoken and ends once its own words are done, so the
        # last cue always ends exactly at `audio_duration`.
        start_time = audio_duration * offset / total_words
        end_time = audio_duration * (offset + len(chunk)) / total_words
        cues.append(
            f"{len(cues) + 1}\n"
            f"{_srt_timestamp(start_time)} --> {_srt_timestamp(end_time)}\n"
            f"{' '.join(chunk)}\n\n"
        )
    return ''.join(cues)
56
 
57
def tts_interface(text, voice, rate, pitch, max_words_per_line):
    """Synthesize speech for ``text`` and write a matching ``.srt`` file.

    Runs the async ``text_to_speech`` helper to completion, measures the
    resulting audio, builds subtitles with ``generate_srt`` and stores them
    next to the audio file.

    Args:
        text: Text to speak and subtitle.
        voice: Voice selection forwarded to ``text_to_speech``.
        rate: Speech-rate adjustment forwarded to ``text_to_speech``.
        pitch: Pitch adjustment forwarded to ``text_to_speech``.
        max_words_per_line: Maximum words per subtitle cue.

    Returns:
        ``(audio_path, srt_path, None)`` on success, or
        ``(None, None, warning)`` when no audio was produced.
    """
    audio_path, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
    # text_to_speech reports rejected input through its second return value,
    # but that value may itself be falsy (NOTE(review): gr.Warning(...) looks
    # like it returns None in recent Gradio releases - confirm), so a missing
    # audio path is treated as failure too instead of being probed below.
    if warning or not audio_path:
        return None, None, warning

    # Close the clip explicitly so the underlying reader is released instead
    # of lingering until garbage collection.
    clip = AudioFileClip(audio_path)
    try:
        audio_duration = clip.duration  # seconds
    finally:
        clip.close()

    srt_content = generate_srt(text, audio_duration, max_words_per_line)
    srt_path = audio_path.replace('.mp3', '_subtitle.srt')

    # Subtitles routinely contain non-ASCII text; do not depend on the
    # platform's default encoding.
    with open(srt_path, 'w', encoding='utf-8') as f:
        f.write(srt_content)

    return audio_path, srt_path, None
73
 
 
 
 
 
 
 
74
  async def create_demo():
75
  voices = await get_voices()
76
 
77
- demo = gr.Interface(
78
- fn=tts_interface,
79
- inputs=[
80
- gr.Textbox(label="Input Text", lines=5),
81
- gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
82
- gr.Slider(minimum=-50, maximum=50, value=0, label="Rate Adjustment (%)", step=1),
83
- gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
84
- gr.Slider(minimum=3, maximum=8, value=5, label="Max Words per Line", step=1),
85
- ],
86
- outputs=[
87
- gr.Audio(label="Generated Audio", type="filepath"),
88
- gr.File(label="Generated Subtitle (.srt)"),
89
- gr.Markdown(label="Warning", visible=False)
90
- ],
91
- title="Edge TTS Text to Speech with SRT",
92
- description="Convert text to speech and generate synchronized subtitles based on speech rate.",
93
- analytics_enabled=False,
94
- allow_flagging=False,
95
- )
96
-
 
 
 
 
 
 
 
 
 
 
 
97
  return demo
98
 
 
99
  if __name__ == "__main__":
100
  demo = asyncio.run(create_demo())
101
  demo.launch()
 
2
  import edge_tts
3
  import asyncio
4
  import tempfile
5
+ import os
6
  from moviepy.editor import AudioFileClip
 
7
 
8
# Get all available voices
async def get_voices():
    """Return a mapping of display label -> Edge TTS voice short name.

    Keys have the form ``"<ShortName> - <Locale> (<Gender>)"``, built from the
    entries reported by ``edge_tts.list_voices()``; values are the matching
    ``ShortName``.
    """
    def _label(voice):
        return f"{voice['ShortName']} - {voice['Locale']} ({voice['Gender']})"

    available = await edge_tts.list_voices()
    return {_label(voice): voice['ShortName'] for voice in available}
12
 
13
+ # Text to speech functionality
14
  async def text_to_speech(text, voice, rate, pitch):
15
  if not text.strip():
16
  return None, gr.Warning("Please enter the text to convert.")
 
21
  rate_str = f"{rate:+d}%"
22
  pitch_str = f"{pitch:+d}Hz"
23
  communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
 
24
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
25
  tmp_path = tmp_file.name
26
  await communicate.save(tmp_path)
 
27
  return tmp_path, None
28
 
29
+ # Generate SRT file with specified lines of subtitles
30
def generate_srt(words, audio_duration, srt_path, num_lines, words_per_line=5):
    """Write an SRT file that spreads ``words`` evenly across the audio.

    Words are grouped into cues of ``num_lines`` text lines with
    ``words_per_line`` words each. No per-word timing is available, so cue
    boundaries are placed in proportion to the number of words spoken; the
    last cue therefore ends exactly at ``audio_duration`` even when it is
    shorter than the others.

    Args:
        words: Sequence of word strings, in spoken order.
        audio_duration: Length of the audio in seconds.
        srt_path: Destination path; the file is created or overwritten.
        num_lines: Text lines per cue. Values below 1 are treated as 1 and
            float values (e.g. from a UI slider) are truncated.
        words_per_line: Words per text line (default 5, the previous fixed
            value).

    Returns:
        ``srt_path``. With no words an empty file is written rather than
        failing on a division by zero.
    """
    lines_per_cue = max(1, int(num_lines))
    per_line = max(1, int(words_per_line))
    cue_size = lines_per_cue * per_line
    total_words = len(words)

    with open(srt_path, 'w', encoding='utf-8') as srt_file:
        cue_offsets = range(0, total_words, cue_size)
        for cue_number, offset in enumerate(cue_offsets, start=1):
            cue_words = words[offset:offset + cue_size]
            line_texts = (
                ' '.join(cue_words[k:k + per_line])
                for k in range(0, len(cue_words), per_line)
            )
            lines = [line for line in line_texts if line]

            # Derive both stamps from the word position instead of adding up
            # a fixed slot length: this cannot drift and cannot overrun the
            # audio when the word count is not a multiple of the cue size.
            start_time = audio_duration * offset / total_words
            end_time = audio_duration * (offset + len(cue_words)) / total_words

            srt_file.write(
                f"{cue_number}\n"
                f"{format_srt_time(start_time)} --> {format_srt_time(end_time)}\n"
                + "\n".join(lines)
                + "\n\n"
            )

    return srt_path


def format_srt_time(seconds):
    """Format an offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp.

    The value is rounded to whole milliseconds before being split into
    fields, so float representation noise cannot drop a millisecond
    (2.3 -> ``,300``) and a rounded-up fraction carries into the seconds.
    Negative input is clamped to zero.
    """
    total_millis = max(0, round(seconds * 1000))
    total_seconds, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"
61
+
62
+ # Text to audio and SRT functionality
63
async def text_to_audio_and_srt(text, voice, rate, pitch, num_lines):
    """Synthesize speech for ``text`` and write a matching ``.srt`` file.

    Awaits ``text_to_speech``, measures the produced audio, and has
    ``generate_srt`` write subtitles to ``<audio path without extension>_subtitle.srt``.

    Args:
        text: Text to speak and subtitle.
        voice: Voice selection forwarded to ``text_to_speech``.
        rate: Speech-rate adjustment forwarded to ``text_to_speech``.
        pitch: Pitch adjustment forwarded to ``text_to_speech``.
        num_lines: Number of text lines per subtitle cue.

    Returns:
        ``(audio_path, srt_path, None)`` on success, or
        ``(None, None, warning)`` when no audio was produced.
    """
    audio_path, warning = await text_to_speech(text, voice, rate, pitch)
    # text_to_speech reports rejected input through its second return value,
    # but that value may itself be falsy (NOTE(review): gr.Warning(...) looks
    # like it returns None in recent Gradio releases - confirm), so a missing
    # audio path is treated as failure too instead of being opened below.
    if warning or not audio_path:
        return None, None, warning

    # Close the clip explicitly so the underlying reader is released instead
    # of lingering until garbage collection.
    audio_clip = AudioFileClip(audio_path)
    try:
        audio_duration = audio_clip.duration
    finally:
        audio_clip.close()

    # Generate the SRT file from the entire text.
    base_name = os.path.splitext(audio_path)[0]
    srt_path = f"{base_name}_subtitle.srt"
    words = text.split()
    generate_srt(words, audio_duration, srt_path, num_lines)

    return audio_path, srt_path, None
78
 
79
+ # Gradio interface function
80
def tts_interface(text, voice, rate, pitch, num_lines):
    """Synchronous entry point used as the Gradio click handler.

    Runs the ``text_to_audio_and_srt`` coroutine to completion and hands back
    its ``(audio_path, srt_path, warning)`` triple unchanged.
    """
    job = text_to_audio_and_srt(text, voice, rate, pitch, num_lines)
    outcome = asyncio.run(job)
    audio_path, srt_path, warning = outcome
    return audio_path, srt_path, warning
83
+
84
# Create Gradio app
async def create_demo():
    """Build the Gradio Blocks UI for speech and subtitle generation.

    Awaits ``get_voices()`` for the dropdown choices (an empty choice is
    listed first and selected by default), lays out the controls and outputs,
    and binds the button to ``tts_interface``.

    Returns:
        The constructed, not yet launched, ``gr.Blocks`` instance.
    """
    voice_catalog = await get_voices()

    # NOTE(review): the nesting below (controls and button in the first
    # column, outputs in the second) is reconstructed - confirm against the
    # deployed layout.
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            <h1 style="text-align: center; color: #333;">Text to Speech with Subtitles</h1>
            <p style="text-align: center; color: #555;">Convert your text to natural-sounding speech and generate subtitles (SRT) for your audio.</p>
            """,
            elem_id="header"
        )

        with gr.Row():
            with gr.Column():
                text_box = gr.Textbox(label="Input Text", lines=5, placeholder="Enter text here...")
                voice_picker = gr.Dropdown(
                    choices=[""] + list(voice_catalog.keys()),
                    label="Select Voice",
                    value="",
                )
                rate_control = gr.Slider(
                    minimum=-50, maximum=50, value=0, label="Rate Adjustment (%)", step=1
                )
                pitch_control = gr.Slider(
                    minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1
                )

                lines_control = gr.Slider(
                    minimum=1, maximum=5, value=2, label="Number of SRT Lines", step=1
                )

                generate_button = gr.Button("Generate Audio and Subtitles", variant="primary")

            with gr.Column():
                audio_player = gr.Audio(label="Generated Audio", type="filepath")
                srt_download = gr.File(label="Generated SRT", file_count="single")
                warning_area = gr.Markdown(label="Warning", visible=False)

        # Order matters: it must match tts_interface's parameters and its
        # (audio, srt, warning) return triple.
        handler_inputs = [text_box, voice_picker, rate_control, pitch_control, lines_control]
        handler_outputs = [audio_player, srt_download, warning_area]
        generate_button.click(
            fn=tts_interface,
            inputs=handler_inputs,
            outputs=handler_outputs
        )

    return demo
120
 
121
# Run the app
if __name__ == "__main__":
    # create_demo() is a coroutine, so it is driven to completion first;
    # only then is the resulting UI object launched.
    ui_coroutine = create_demo()
    demo = asyncio.run(ui_coroutine)
    demo.launch()