kevinwang676 committed on
Commit
53bd66a
·
verified ·
1 Parent(s): ada57ff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +63 -25
app.py CHANGED
@@ -157,16 +157,16 @@ class subtitle:
157
  def normalize(self,ntype:str,fps=30):
158
  if ntype=="prcsv":
159
  h,m,s,fs=(self.start_time.replace(';',':')).split(":")#seconds
160
- self.start_time=int(h)*3600+int(m)*60+int(s)+round(int(fs)/fps,2)
161
  h,m,s,fs=(self.end_time.replace(';',':')).split(":")
162
- self.end_time=int(h)*3600+int(m)*60+int(s)+round(int(fs)/fps,2)
163
  elif ntype=="srt":
164
  h,m,s=self.start_time.split(":")
165
  s=s.replace(",",".")
166
- self.start_time=int(h)*3600+int(m)*60+round(float(s),2)
167
  h,m,s=self.end_time.split(":")
168
  s=s.replace(",",".")
169
- self.end_time=int(h)*3600+int(m)*60+round(float(s),2)
170
  else:
171
  raise ValueError
172
  def add_offset(self,offset=0):
@@ -217,7 +217,41 @@ def read_srt(uploaded_file):
217
  subtitle_list.append(st)
218
  return subtitle_list
219
 
 
220
  from pydub import AudioSegment
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
 
222
  def trim_audio(intervals, input_file_path, output_file_path):
223
  # load the audio file
@@ -227,21 +261,20 @@ def trim_audio(intervals, input_file_path, output_file_path):
227
  for i, (start_time, end_time) in enumerate(intervals):
228
  # extract the segment of the audio
229
  segment = audio[start_time*1000:end_time*1000]
230
- output_file_path_i = f"{output_file_path}_{i}.wav"
231
 
232
- if len(segment) < 3000:
233
- # Calculate how many times to repeat the audio to make it at least 2 seconds long
234
- repeat_count = (3000 // len(segment)) + 2
235
  # Repeat the audio
236
  longer_audio = segment * repeat_count
237
  # Save the extended audio
238
- print(f"Audio was less than 3 seconds. Extended to {len(longer_audio)} milliseconds.")
239
  longer_audio.export(output_file_path_i, format='wav')
 
240
  else:
241
- print("Audio is already 3 seconds or longer.")
242
- segment.export(output_file_path_i, format='wav')
243
-
244
-
245
 
246
  import re
247
 
@@ -286,19 +319,24 @@ def convert_from_srt(apikey, filename, audio_full, voice, multilingual):
286
  shutil.rmtree("output")
287
  if multilingual==False:
288
  for i in subtitle_list:
289
- os.makedirs("output", exist_ok=True)
290
- trim_audio([[i.start_time, i.end_time]], audio_full, f"sliced_audio_{i.index}")
291
- print(f"正在合成第{i.index}条语音")
292
- print(f"语音内容:{i.text}")
293
- convert(apikey, i.text, f"sliced_audio_{i.index}_0.wav", voice, i.text + " " + str(i.index))
 
 
 
294
  else:
295
  for i in subtitle_list:
296
- os.makedirs("output", exist_ok=True)
297
- trim_audio([[i.start_time, i.end_time]], audio_full, f"sliced_audio_{i.index}")
298
- print(f"正在合成第{i.index}条语音")
299
- print(f"语音内容:{i.text.splitlines()[1]}")
300
- convert(apikey, i.text.splitlines()[1], f"sliced_audio_{i.index}_0.wav", voice, i.text.splitlines()[1] + " " + str(i.index))
301
-
 
 
302
  merge_audios("output")
303
 
304
  return "AI配音版.wav"
@@ -334,4 +372,4 @@ with gr.Blocks() as app:
334
  </div>
335
  ''')
336
 
337
- app.launch(show_error=True)
 
157
def normalize(self, ntype: str, fps=30):
    """Convert ``self.start_time``/``self.end_time`` from strings to seconds.

    Supported formats:
      * ``"prcsv"`` — Premiere-style ``H:M:S;F`` (or ``H:M:S:F``) timestamps,
        where the last field is a frame count converted via ``fps``.
      * ``"srt"``   — SubRip ``H:M:S,mmm`` timestamps (comma decimal mark).

    Args:
        ntype: timestamp format, ``"prcsv"`` or ``"srt"``.
        fps: frames per second; used only for the ``"prcsv"`` frame field.

    Raises:
        ValueError: if ``ntype`` is not a supported format.
    """
    if ntype == "prcsv":
        # Premiere separates the frame count with ';' — unify to ':' first.
        h, m, s, fs = (self.start_time.replace(';', ':')).split(":")
        self.start_time = int(h) * 3600 + int(m) * 60 + int(s) + round(int(fs) / fps, 5)
        h, m, s, fs = (self.end_time.replace(';', ':')).split(":")
        self.end_time = int(h) * 3600 + int(m) * 60 + int(s) + round(int(fs) / fps, 5)
    elif ntype == "srt":
        h, m, s = self.start_time.split(":")
        # SRT uses a comma as the decimal separator for milliseconds.
        s = s.replace(",", ".")
        self.start_time = int(h) * 3600 + int(m) * 60 + round(float(s), 5)
        h, m, s = self.end_time.split(":")
        s = s.replace(",", ".")
        self.end_time = int(h) * 3600 + int(m) * 60 + round(float(s), 5)
    else:
        # Was a bare `raise ValueError`; a message makes failures diagnosable.
        raise ValueError(f"unsupported subtitle timestamp type: {ntype!r}")
172
  def add_offset(self,offset=0):
 
217
  subtitle_list.append(st)
218
  return subtitle_list
219
 
220
+ import webrtcvad
221
  from pydub import AudioSegment
222
+ from pydub.utils import make_chunks
223
+
224
def vad(audio_name, out_path_name):
    """Strip non-speech audio from a WAV file using WebRTC VAD.

    Loads ``audio_name``, converts it to 48 kHz mono 16-bit PCM (the only
    formats WebRTC VAD accepts), keeps the 30 ms frames classified as speech,
    and writes the concatenated voiced audio to ``<out_path_name>.wav``.

    Args:
        audio_name: path to the input WAV file.
        out_path_name: output path WITHOUT the ".wav" extension.
    """
    audio = AudioSegment.from_file(audio_name, format="wav")
    # WebRTC VAD supports only 8000/16000/32000/48000 Hz sample rates.
    audio = audio.set_frame_rate(48000)
    # Single channel (mono) is required by the VAD.
    audio = audio.set_channels(1)
    # Fix: webrtcvad requires 16-bit PCM; input WAVs may be 8/24/32-bit.
    audio = audio.set_sample_width(2)

    # Aggressiveness mode: integer 0..3, 3 filters non-speech most aggressively.
    vad = webrtcvad.Vad()
    vad.set_mode(3)

    frame_duration = 30  # ms; webrtcvad accepts only 10, 20 or 30 ms frames
    frame_width = int(audio.frame_rate * frame_duration / 1000)  # samples/frame
    frames = make_chunks(audio, frame_duration)

    # Perform voice activity detection frame by frame.
    voiced_frames = []
    for frame in frames:
        # The trailing chunk may be shorter than a full frame; the VAD would
        # reject it, so stop there (2 bytes per 16-bit sample).
        if len(frame.raw_data) < frame_width * 2:
            break
        if vad.is_speech(frame.raw_data, audio.frame_rate):
            voiced_frames.append(frame)

    # Concatenate the voiced frames back into a single segment.
    voiced_audio = sum(voiced_frames, AudioSegment.silent(duration=0))
    voiced_audio.export(f"{out_path_name}.wav", format="wav")
254
+
255
 
256
  def trim_audio(intervals, input_file_path, output_file_path):
257
  # load the audio file
 
261
  for i, (start_time, end_time) in enumerate(intervals):
262
  # extract the segment of the audio
263
  segment = audio[start_time*1000:end_time*1000]
264
+ output_file_path_i = f"increased_{i}.wav"
265
 
266
+ if len(segment) < 5000:
267
+ # Calculate how many times to repeat the audio to make it at least 5 seconds long
268
+ repeat_count = (5000 // len(segment)) + 3
269
  # Repeat the audio
270
  longer_audio = segment * repeat_count
271
  # Save the extended audio
272
+ print(f"Audio was less than 5 seconds. Extended to {len(longer_audio)} milliseconds.")
273
  longer_audio.export(output_file_path_i, format='wav')
274
+ vad(f"{output_file_path_i}", f"{output_file_path}_{i}")
275
  else:
276
+ print("Audio is already 5 seconds or longer.")
277
+ segment.export(f"{output_file_path}_{i}.wav", format='wav')
 
 
278
 
279
  import re
280
 
 
319
  shutil.rmtree("output")
320
  if multilingual==False:
321
  for i in subtitle_list:
322
+ try:
323
+ os.makedirs("output", exist_ok=True)
324
+ trim_audio([[i.start_time, i.end_time]], audio_full, f"sliced_audio_{i.index}")
325
+ print(f"正在合成第{i.index}条语音")
326
+ print(f"语音内容:{i.text}")
327
+ convert(apikey, i.text, f"sliced_audio_{i.index}_0.wav", voice, i.text + " " + str(i.index))
328
+ except Exception:
329
+ pass
330
  else:
331
  for i in subtitle_list:
332
+ try:
333
+ os.makedirs("output", exist_ok=True)
334
+ trim_audio([[i.start_time, i.end_time]], audio_full, f"sliced_audio_{i.index}")
335
+ print(f"正在合成第{i.index}条语音")
336
+ print(f"语音内容:{i.text.splitlines()[1]}")
337
+ convert(apikey, i.text.splitlines()[1], f"sliced_audio_{i.index}_0.wav", voice, i.text.splitlines()[1] + " " + str(i.index))
338
+ except Exception:
339
+ pass
340
  merge_audios("output")
341
 
342
  return "AI配音版.wav"
 
372
  </div>
373
  ''')
374
 
375
+ app.launch(share=False, show_error=True)