# prosalign/graph.py
import os
import subprocess

import librosa
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf
from pydub import AudioSegment
from scipy import signal
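# Note: pydub needs ffmpeg (or libav) on the PATH to decode non-wav formats,
# and get_pitch_tracks below expects Google's REAPER pitch tracker built at
# REAPER/build/reaper (see https://github.com/google/REAPER).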
def readaud(sound_path):
    """Read an audio file as mono float32, resampled to 16 kHz."""
    aud, sr = sf.read(sound_path, dtype=np.float32)
    if len(aud.shape) == 2:
        # average the two channels of stereo audio down to mono
        aud = aud.mean(1)
    if sr != 16000:
        # resample to the 16 kHz rate the aligner expects
        alen = int(aud.shape[0] / sr * 16000)
        aud = signal.resample(aud, alen)
    return aud
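# Usage sketch (hypothetical path): any sample rate / channel count comes
# back as mono float32 at 16 kHz:
#   aud = readaud('recordings/sample.wav')
#   print(aud.shape)  # (n_samples_at_16kHz,)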
def normalise_transcript(xcp):
    """Lowercase the transcript and collapse runs of spaces to single spaces."""
    xcp = xcp.lower()
    while '  ' in xcp:
        xcp = xcp.replace('  ', ' ')
    return xcp
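# For example, normalise_transcript('Hello   World') returns 'hello world'.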
def get_pitch_tracks(sound_path):
    """Run REAPER on the recording and return [time, f0] pairs for voiced frames."""
    orig_ftype = sound_path.split('.')[-1]
    if orig_ftype == 'wav':
        wav_path = sound_path
    else:
        # REAPER needs a wav file: convert via pydub into a temporary copy
        aud_data = AudioSegment.from_file(sound_path, orig_ftype)
        curdir = os.getcwd()
        fname = sound_path.split('/')[-1].replace(f'.{orig_ftype}', '')
        tmp_path = f'{curdir}/{fname}_tmp.wav'
        aud_data.export(tmp_path, format="wav")
        wav_path = tmp_path
    f0_data = subprocess.run(
        ["REAPER/build/reaper", "-i", wav_path, '-f', '/dev/stdout', '-a'],
        capture_output=True).stdout
    f0_data = f0_data.decode()
    # REAPER writes an EST-format track; the data lines follow the header
    f0_data = f0_data.split('EST_Header_End\n')[1].splitlines()
    f0_data = [l.split(' ') for l in f0_data]
    # the last line or two hold other info in a different format
    f0_data = [l for l in f0_data if len(l) == 3]
    # each line is <time> <voicing flag> <f0>; keep only voiced frames
    f0_data = [[float(t), float(f)] for t, v, f in f0_data if v == '1']
    if orig_ftype != 'wav':
        os.remove(tmp_path)
    return f0_data
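# Usage sketch: assumes the REAPER binary exists at REAPER/build/reaper
# relative to the working directory; the path below is hypothetical:
#   f0 = get_pitch_tracks('recordings/sample.wav')
#   # -> list of [time_seconds, f0_hz] pairs for voiced frames only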
# the transcript could come from a corpus alongside the wav file,
# be entered by the user,
# or come from a previous speech recognition pass
def align_and_graph(sound_path, transcript, aligner_function):
    """Align a transcript to a recording and plot pitch, energy, and boundaries."""
    plt.close('all')

    # fetch data
    speech = readaud(sound_path)
    w_align, seg_align = aligner_function(speech, normalise_transcript(transcript))

    # set up the graph shape
    rec_start = w_align[0][1]
    rec_end = w_align[-1][2]
    f0_data = get_pitch_tracks(sound_path)
    if f0_data:
        f_max = max([f0 for t, f0 in f0_data]) + 50
    else:
        f_max = 400
    fig, axes1 = plt.subplots(figsize=(15, 3))
    plt.xlim([rec_start, rec_end])
    axes1.set_ylim([0.0, f_max])
    axes1.get_xaxis().set_visible(False)

    # draw word boundaries, with each word label centred above the plot
    for w, s, e in w_align:
        plt.vlines(s, 0, f_max, linewidth=0.5, color='black')
        plt.vlines(e, 0, f_max, linewidth=0.5, color='dimgrey')
        plt.text((s + e) / 2, f_max + 15, w, fontsize=15, ha="center")

    # draw phone / char boundaries, with labels below the axis
    for p, s, e in seg_align:
        plt.vlines(s, 0, f_max, linewidth=0.3, color='cadetblue', linestyle=(0, (10, 4)))
        plt.vlines(e, 0, f_max, linewidth=0.3, color='cadetblue', linestyle=(0, (10, 4)))
        plt.text((s + e) / 2 - (len(p) * .01), -1 * f_max / 10, p, fontsize=11, color='teal')

    # plot the pitch contour
    f0c = "blue"
    axes1.scatter([t for t, f0 in f0_data], [f0 for t, f0 in f0_data], color=f0c)

    # compute RMS energy at librosa's default frame and hop lengths
    y, sr = librosa.load(sound_path)
    fr_l = 2048  # librosa default frame length
    h_l = 512    # librosa default hop length
    rmse = librosa.feature.rms(y=y, frame_length=fr_l, hop_length=h_l)
    rmse = rmse[0]

    # show rms energy on a second y-axis
    axes2 = axes1.twinx()
    axes2.set_ylim([0.0, 0.5])
    rms_xval = [(h_l * i) / sr for i in range(len(rmse))]
    axes2.plot(rms_xval, rmse, color='peachpuff', linewidth=3.5)

    # label the graph
    axes1.set_ylabel("Pitch (F0, Hz)", fontsize=14, color="blue")
    axes2.set_ylabel("RMS energy", fontsize=14, color="coral")
    return fig
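# End-to-end usage sketch. From the code above, aligner_function is expected
# to return two lists of (label, start_seconds, end_seconds) tuples, one for
# words and one for phones/chars; 'my_aligner' and the path are hypothetical:
#   fig = align_and_graph('recordings/sample.wav', 'the transcript', my_aligner)
#   fig.savefig('alignment.png', bbox_inches='tight')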
# sample Faroese test words: uppboðssøla bussleiðini viðmerkingar upprunaligur