|
import numpy as np |
|
import soundfile as sf |
|
from scipy import signal |
|
import librosa |
|
import subprocess |
|
import matplotlib.pyplot as plt |
|
from pydub import AudioSegment |
|
|
|
|
|
def readaud(sound_path):
    """Load an audio file as a single-channel signal sampled at 16 kHz.

    Args:
        sound_path: Path of the audio file, passed straight to
            ``soundfile.read``.

    Returns:
        A 1-D array of samples. The file is read as float32; two-dimensional
        (multi-channel) data is averaged over axis 1, and the signal is
        resampled with ``scipy.signal.resample`` when the file's sample rate
        is not 16000.
    """
    target_rate = 16000
    samples, rate = sf.read(sound_path, dtype=np.float32)

    # Two dimensions means more than one channel: average them into one.
    if samples.ndim == 2:
        samples = samples.mean(axis=1)

    if rate == target_rate:
        return samples

    # Output length that keeps the duration unchanged at the target rate.
    n_out = int(samples.shape[0] / rate * target_rate)
    return signal.resample(samples, n_out)
|
|
|
|
|
def normalise_transcript(xcp):
    """Lower-case a transcript and collapse runs of spaces to a single space.

    Args:
        xcp: Transcript text.

    Returns:
        The lower-cased text with every run of consecutive spaces reduced to
        one space. Leading/trailing spaces and other whitespace characters
        (tabs, newlines) are not otherwise altered.
    """
    # Built from a single space so the two-character pattern cannot be
    # silently collapsed by whitespace-normalising tools; a one-space pattern
    # here would make the loop below spin forever.
    double_space = ' ' * 2

    xcp = xcp.lower()
    # A single pass only shortens long runs, so repeat until none remain.
    while double_space in xcp:
        xcp = xcp.replace(double_space, ' ')
    return xcp
|
|
|
|
|
|
|
def get_pitch_tracks(sound_path):
    """Extract the voiced pitch (F0) track of an audio file with REAPER.

    WAV files are handed to REAPER as-is. Any other format is first converted
    to WAV with pydub inside a private temporary directory, which is removed
    again even if conversion, REAPER, or parsing fails.

    Args:
        sound_path: Path to the audio file. The extension (case-insensitive)
            decides whether a conversion to WAV is needed.

    Returns:
        A list of ``[time, f0]`` float pairs, one per frame that REAPER
        marked as voiced. The list is empty when no frame is voiced.

    Raises:
        IndexError: If REAPER's output does not contain the
            ``EST_Header_End`` marker (for example because REAPER failed).
        FileNotFoundError: If the ``REAPER/build/reaper`` binary cannot be
            found relative to the current working directory.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    stem, ext = os.path.splitext(os.path.basename(sound_path))
    ext = ext.lstrip('.')

    if ext.lower() == 'wav':
        # NOTE(review): the file goes to REAPER unchanged - confirm that the
        # WAV encodings seen in practice are ones REAPER accepts.
        return _run_reaper(sound_path)

    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = os.path.join(tmp_dir, f'{stem}_tmp.wav')
        # With no extension, let pydub/ffmpeg detect the format themselves.
        audio = AudioSegment.from_file(sound_path, ext or None)
        # export() hands back the open output file; close it so the data is
        # fully on disk and no handle leaks before REAPER reads the file.
        exported = audio.export(tmp_path, format="wav")
        exported.close()
        return _run_reaper(tmp_path)


def _run_reaper(wav_path):
    """Run the REAPER binary on ``wav_path`` and return its voiced F0 pairs."""
    # '-a' requests ASCII output and '-f /dev/stdout' sends the F0 track to
    # stdout so it can be captured without an intermediate file.
    result = subprocess.run(
        ["REAPER/build/reaper", "-i", wav_path, '-f', '/dev/stdout', '-a'],
        capture_output=True,
    )
    return _parse_reaper_f0(result.stdout.decode())


def _parse_reaper_f0(reaper_text):
    """Turn REAPER's ASCII F0 output into ``[time, f0]`` pairs for voiced frames."""
    # Everything up to and including this marker is the file header.
    body = reaper_text.split('EST_Header_End\n')[1]

    voiced = []
    for line in body.splitlines():
        fields = line.split(' ')
        # Data rows have exactly three fields: time, voicing flag, F0.
        if len(fields) != 3:
            continue
        time_str, voicing_flag, f0_str = fields
        if voicing_flag == '1':
            voiced.append([float(time_str), float(f0_str)])
    return voiced
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def align_and_graph(sound_path, transcript, aligner_function):
    """Draw word/phone alignments over the pitch track and RMS energy of a recording.

    Args:
        sound_path: Path to the audio file.
        transcript: Text of what is said in the recording; it is passed
            through ``normalise_transcript`` before alignment.
        aligner_function: Callable invoked as
            ``aligner_function(speech, normalised_transcript)``. It must
            return a pair ``(word_alignment, segment_alignment)``, each a
            sequence of ``(label, start, end)`` triples.

    Returns:
        The matplotlib figure. Its primary y-axis shows the F0 points with
        word and segment boundaries; a twin y-axis shows RMS energy.

    Note:
        All previously open matplotlib figures are closed on entry.
    """
    # Drop figures left over from earlier calls.
    plt.close('all')

    # Align the transcript against the audio.
    audio_samples = readaud(sound_path)
    clean_text = normalise_transcript(transcript)
    word_spans, phone_spans = aligner_function(audio_samples, clean_text)

    # The x-axis covers the start of the first word to the end of the last.
    rec_start = word_spans[0][1]
    rec_end = word_spans[-1][2]

    pitch_points = get_pitch_tracks(sound_path)
    pitch_times = [point[0] for point in pitch_points]
    pitch_values = [point[1] for point in pitch_points]
    # 50 units of headroom above the highest F0; 400 when nothing is voiced.
    f_max = max(pitch_values) + 50 if pitch_values else 400

    fig, pitch_ax = plt.subplots(figsize=(15, 3))
    pitch_ax.set_xlim([rec_start, rec_end])
    pitch_ax.set_ylim([0.0, f_max])
    pitch_ax.get_xaxis().set_visible(False)

    # Word boundaries, with each word written above the plot area.
    for word, start, end in word_spans:
        pitch_ax.vlines(start, 0, f_max, linewidth=0.5, color='black')
        pitch_ax.vlines(end, 0, f_max, linewidth=0.5, color='dimgrey')
        pitch_ax.text((start + end) / 2, f_max + 15, word,
                      fontsize=15, ha="center")

    # Segment boundaries as dashed lines, labelled below the plot area.
    dash_pattern = (0, (10, 4))
    for phone, start, end in phone_spans:
        for boundary in (start, end):
            pitch_ax.vlines(boundary, 0, f_max, linewidth=0.3,
                            color='cadetblue', linestyle=dash_pattern)
        # Shift the label left in proportion to its length.
        label_x = (start + end) / 2 - (len(phone) * .01)
        pitch_ax.text(label_x, -1 * f_max / 10, phone,
                      fontsize=11, color='teal')

    f0c = "blue"
    pitch_ax.scatter(pitch_times, pitch_values, color=f0c)

    # Frame-wise RMS energy, computed from a separate librosa load.
    waveform, sample_rate = librosa.load(sound_path)
    frame_len = 2048
    hop_len = 512
    rms_frames = librosa.feature.rms(
        y=waveform, frame_length=frame_len, hop_length=hop_len)[0]

    energy_ax = pitch_ax.twinx()
    energy_ax.set_ylim([0.0, 0.5])
    # Each frame is placed at the time of its hop offset.
    frame_times = [(hop_len * idx) / sample_rate
                   for idx, _ in enumerate(rms_frames)]
    energy_ax.plot(frame_times, rms_frames, color='peachpuff', linewidth=3.5)

    pitch_ax.set_ylabel("Pitch (F0, Hz)", fontsize=14, color="blue")
    energy_ax.set_ylabel("RMS energy", fontsize=14, color="coral")

    return fig
|
|
|
|
|
|
|
|
|
|