|
import io |
|
|
|
import librosa |
|
import numpy as np |
|
import pydub |
|
|
|
from src.utils import have_pyrubberband |
|
|
|
|
|
|
|
|
|
|
|
def get_wave_header(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
    """Build a complete RIFF/WAVE byte stream around raw PCM frames.

    With the default empty ``frame_input`` this yields just the 44-byte WAV
    header, which is useful as the opening chunk of a streamed response.

    :param frame_input: raw PCM frame bytes to embed (default: none)
    :param channels: number of interleaved channels
    :param sample_width: bytes per sample (2 == 16-bit PCM)
    :param sample_rate: frames per second
    :return: bytes of a full WAV container (header + frames)
    """
    import wave

    buffer = io.BytesIO()
    writer = wave.open(buffer, "wb")
    try:
        writer.setnchannels(channels)
        writer.setsampwidth(sample_width)
        writer.setframerate(sample_rate)
        writer.writeframes(frame_input)
    finally:
        writer.close()

    buffer.seek(0)
    return buffer.read()
|
|
|
|
|
def prepare_speech(sr=24000):
    """Return an empty WAV header announcing a speech stream at *sr* Hz.

    :param sr: sample rate (frames per second) advertised in the header
    :return: bytes of a WAV header with zero audio frames
    """
    header_bytes = get_wave_header(sample_rate=sr)
    return header_bytes
|
|
|
|
|
def get_no_audio(return_as_byte=True, return_nonbyte_as_file=False, sr=None):
    """Return the canonical "no audio" placeholder in the requested form.

    :param return_as_byte: if True, return empty bytes (``b""``)
    :param return_nonbyte_as_file: if True (and not byte mode), return None,
        standing in for "no file"
    :param sr: sample rate; required only for the (sr, ndarray) tuple form
    :return: ``b""``, ``None``, or ``(sr, empty int16 ndarray)``
    """
    # Guard clauses, most common case first: byte mode.
    if return_as_byte:
        return b""
    # Non-byte, file-style placeholder.
    if return_nonbyte_as_file:
        return None
    # Gradio-style (sample_rate, samples) tuple needs a concrete rate.
    assert sr is not None
    return sr, np.array([]).astype(np.int16)
|
|
|
|
|
def combine_audios(audios, audio=None, channels=1, sample_width=2, sr=24000, expect_bytes=True):
    """Concatenate a list of raw PCM chunks (plus an optional trailing chunk)
    into one stream via pydub.

    :param audios: iterable of raw PCM chunks — bytes/bytearray or file-like
        objects (presumably all the same kind; verify against callers)
    :param audio: optional final chunk appended after ``audios``; asserted to
        be bytes-like (or None) below
    :param channels: channel count of the raw PCM data
    :param sample_width: bytes per sample of the raw PCM data
    :param sr: frame rate of the raw PCM data
    :param expect_bytes: force byte handling/export even before inspecting inputs
    :return: combined raw PCM — bytes when byte mode, else a pydub AudioSegment;
        or the "no audio" placeholder when nothing real was supplied
    """
    # Placeholder for "nothing to combine" (b"" with the default byte form).
    no_audio = get_no_audio(sr=sr)

    # Anything that is not the placeholder, None, or '' counts as real audio.
    # NOTE(review): membership test relies on == against b""/None/'' — assumes
    # chunks are bytes-like, not numpy arrays; confirm against callers.
    have_audio = any(x not in [no_audio, None, ''] for x in audios) or audio not in [no_audio, None, '']

    if not have_audio:
        return no_audio

    if audio or audios:
        # Byte mode is forced by expect_bytes, or inferred from the first
        # list entry / the trailing chunk actually being bytes-like.
        is_bytes = expect_bytes

        if audios:
            is_bytes |= isinstance(audios[0], (bytes, bytearray))

        if audio:
            is_bytes |= isinstance(audio, (bytes, bytearray))

        # The trailing chunk must be bytes-like if present at all.
        assert audio is None or isinstance(audio, (bytes, bytearray))

        from pydub import AudioSegment

        combined_wav = AudioSegment.empty()

        for x in audios:
            if x is not None:
                # Wrap bytes in a file-like object; pass file-likes through.
                s = io.BytesIO(x) if is_bytes else x

                combined_wav += AudioSegment.from_raw(s, sample_width=sample_width, frame_rate=sr, channels=channels)

        if audio is not None:
            s = io.BytesIO(audio) if is_bytes else audio

            combined_wav += AudioSegment.from_raw(s, sample_width=sample_width, frame_rate=sr, channels=channels)

        if is_bytes:
            # Export back to headerless raw PCM bytes to mirror the inputs.
            combined_wav = combined_wav.export(format='raw').read()

        return combined_wav

    # Unreachable in practice when have_audio is True and both inputs are
    # falsy placeholders — kept as a defensive fallthrough.
    return audio
|
|
|
|
|
def chunk_speed_change(chunk, sr, tts_speed=1.0):
    """Time-stretch an audio chunk by ``tts_speed`` without changing pitch.

    :param chunk: audio samples — the pyrubberband branch treats it as a numpy
        array, the pydub branch as raw PCM bytes (16-bit mono); presumably the
        caller knows which backend is active — TODO confirm against callers
    :param sr: sample rate of ``chunk``
    :param tts_speed: speed factor; >1.0 speeds up, <1.0 slows down
    :return: the adjusted chunk. NOTE(review): return shape differs by branch —
        pyrubberband yields a flat int16 array, the pydub fallback yields the
        2-D (frames, channels) array from pydub_to_np; confirm callers accept both.
    """
    # No-op fast path.
    if tts_speed == 1.0:
        return chunk

    # Preferred backend: pyrubberband handles both speed-up and slow-down.
    if have_pyrubberband:
        import pyrubberband as pyrb

        chunk = pyrb.time_stretch(chunk, sr, tts_speed)

        # time_stretch returns floats in [-1, 1]; rescale to 16-bit PCM.
        chunk = (chunk * 32767).astype(np.int16)

        return chunk

    # Fallback backend (pydub.speedup) can only speed audio up, so a
    # requested slow-down is silently skipped and the chunk returned as-is.
    if tts_speed < 1.0:
        return chunk

    from pydub import AudioSegment

    from pydub.effects import speedup

    # Interpret the chunk as headerless 16-bit mono PCM at rate sr.
    s = io.BytesIO(chunk)

    channels = 1

    sample_width = 2

    audio = AudioSegment.from_raw(s, sample_width=sample_width, frame_rate=sr, channels=channels)

    # 150 is pydub's chunk_size (ms) for the speedup crossfade windows.
    chunk = pydub_to_np(speedup(audio, tts_speed, 150))

    return chunk
|
|
|
|
|
def pydub_to_np(audio: "pydub.AudioSegment") -> np.ndarray:
    """Convert a pydub AudioSegment into an int16 numpy array.

    The samples are taken as-is (no rescaling) and reshaped so each row is
    one frame of interleaved channel samples.

    :param audio: the segment to convert
    :return: int16 ndarray of shape [n_frames, n_channels]
    """
    # Fix vs original: the return annotation read ``(np.ndarray, int)``, which
    # is not valid typing syntax and wrongly implied a (samples, rate) tuple —
    # only the sample array is returned. The parameter annotation is quoted so
    # it is not evaluated at definition time.
    samples = np.array(audio.get_array_of_samples(), dtype=np.int16)
    return samples.reshape((-1, audio.channels))
|
|