11Labs-TTS-Free-VC-NEW

Sleeping

App Files Files Community

kevinwang676 committed on May 15

Commit

6829dab

•

1 Parent(s): 1d505ec

Update app_share.py

Browse files

Files changed (1) hide show

app_share.py +42 -7

app_share.py CHANGED Viewed

@@ -217,7 +217,41 @@ def read_srt(uploaded_file):
     subtitle_list.append(st)
     return subtitle_list
 from pydub import AudioSegment
 def trim_audio(intervals, input_file_path, output_file_path):
     # load the audio file
@@ -227,19 +261,20 @@ def trim_audio(intervals, input_file_path, output_file_path):
     for i, (start_time, end_time) in enumerate(intervals):
         # extract the segment of the audio
         segment = audio[start_time*1000:end_time*1000]
-        output_file_path_i = f"{output_file_path}_{i}.wav"
-        if len(segment) < 3000:
-            # Calculate how many times to repeat the audio to make it at least 2 seconds long
-            repeat_count = (3000 // len(segment)) + 2
             # Repeat the audio
             longer_audio = segment * repeat_count
             # Save the extended audio
-            print(f"Audio was less than 3 seconds. Extended to {len(longer_audio)} milliseconds.")
             longer_audio.export(output_file_path_i, format='wav')
         else:
-            print("Audio is already 3 seconds or longer.")
-            segment.export(output_file_path_i, format='wav')
 import re

     subtitle_list.append(st)
     return subtitle_list
+import webrtcvad
 from pydub import AudioSegment
+from pydub.utils import make_chunks
+def vad(audio_name, out_path_name):
+  audio = AudioSegment.from_file(audio_name, format="wav")
+  # Set the desired sample rate (WebRTC VAD supports only 8000, 16000, 32000, or 48000 Hz)
+  audio = audio.set_frame_rate(48000)
+  # Set single channel (mono)
+  audio = audio.set_channels(1)
+  # Initialize VAD
+  vad = webrtcvad.Vad()
+  # Set aggressiveness mode (an integer between 0 and 3, 3 is the most aggressive)
+  vad.set_mode(3)
+  # Convert pydub audio to bytes
+  frame_duration = 30  # Duration of a frame in ms
+  frame_width = int(audio.frame_rate * frame_duration / 1000)  # width of a frame in samples
+  frames = make_chunks(audio, frame_duration)
+  # Perform voice activity detection
+  voiced_frames = []
+  for frame in frames:
+      if len(frame.raw_data) < frame_width * 2:  # Ensure frame is correct length
+          break
+      is_speech = vad.is_speech(frame.raw_data, audio.frame_rate)
+      if is_speech:
+          voiced_frames.append(frame)
+  # Combine voiced frames back to an audio segment
+  voiced_audio = sum(voiced_frames, AudioSegment.silent(duration=0))
+  voiced_audio.export(f"{out_path_name}.wav", format="wav")
 def trim_audio(intervals, input_file_path, output_file_path):
     # load the audio file
     for i, (start_time, end_time) in enumerate(intervals):
         # extract the segment of the audio
         segment = audio[start_time*1000:end_time*1000]
+        output_file_path_i = f"increased_{i}.wav"
+        if len(segment) < 5000:
+            # Calculate how many times to repeat the audio to make it at least 5 seconds long
+            repeat_count = (5000 // len(segment)) + 3
             # Repeat the audio
             longer_audio = segment * repeat_count
             # Save the extended audio
+            print(f"Audio was less than 5 seconds. Extended to {len(longer_audio)} milliseconds.")
             longer_audio.export(output_file_path_i, format='wav')
+            vad(f"{output_file_path_i}", f"{output_file_path}_{i}")
         else:
+            print("Audio is already 5 seconds or longer.")
+            segment.export(f"{output_file_path}_{i}.wav", format='wav')
 import re