File size: 3,975 Bytes
459923a e8949c5 459923a e8949c5 459923a e8949c5 459923a e8949c5 bd41e11 e8949c5 3df2266 260f9cb 3df2266 62e3c32 3df2266 62e3c32 f54c768 3df2266 260f9cb e8949c5 260f9cb 459923a e8949c5 459923a e8949c5 459923a 260f9cb e8949c5 260f9cb 459923a e8949c5 424bfb6 459923a e8949c5 459923a d88f981 459923a 1d4bdba 459923a 4cbda75 459923a a2f5f66 459923a e8949c5 459923a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
import numpy as np
import soundfile as sf
from scipy import signal
import librosa
import subprocess
import matplotlib.pyplot as plt
from pydub import AudioSegment
def readaud(sound_path):
    """Read an audio file and return it as a mono signal at 16 kHz.

    Args:
        sound_path: Path of an audio file that ``soundfile`` can read.

    Returns:
        1-D NumPy array of samples. The file is read as float32; when the
        file's sample rate is not 16000 Hz the signal is resampled to it.
    """
    target_sr = 16000
    aud, sr = sf.read(sound_path, dtype=np.float32)
    # A 2-D result means several channels; average over axis 1 to get mono.
    if aud.ndim == 2:
        aud = aud.mean(1)
    if sr != target_sr:
        # Keep the duration constant: scale the sample count by the rate ratio.
        alen = int(aud.shape[0] / sr * target_sr)
        aud = signal.resample(aud, alen)
    return aud
def normalise_transcript(xcp):
    """Lower-case a transcript and collapse runs of spaces to a single space.

    Only the space character is collapsed; leading/trailing spaces and other
    whitespace (tabs, newlines) are left as they are.

    Args:
        xcp: Transcript text.

    Returns:
        The lower-cased transcript with no two consecutive spaces.
    """
    # Built explicitly so the two-space needle cannot be confused with a
    # single space (searching for one space and replacing it with one space
    # would loop forever).
    double_space = " " * 2
    xcp = xcp.lower()
    while double_space in xcp:
        xcp = xcp.replace(double_space, " ")
    return xcp
def _run_reaper(wav_path):
    """Run the REAPER binary on *wav_path* and return its stdout as text."""
    result = subprocess.run(
        ["REAPER/build/reaper", "-i", wav_path, "-f", "/dev/stdout", "-a"],
        capture_output=True,
    )
    return result.stdout.decode()


def _parse_reaper_f0(reaper_output):
    """Extract ``[time, f0]`` pairs from REAPER's text output."""
    _, marker, body = reaper_output.partition("EST_Header_End\n")
    if not marker:
        raise RuntimeError(
            "REAPER produced no 'EST_Header_End' marker; "
            "check that REAPER/build/reaper exists and accepted the input file"
        )

    f0_data = []
    for line in body.splitlines():
        fields = line.split(" ")
        # The last line or two carry other info in a different format.
        if len(fields) != 3:
            continue
        time_s, voiced, f0 = fields
        # Keep only rows whose middle field is '1' (presumably REAPER's
        # voiced flag -- confirm against the REAPER output format).
        if voiced == "1":
            f0_data.append([float(time_s), float(f0)])
    return f0_data


def get_pitch_tracks(sound_path):
    """Estimate the pitch track of an audio file with the REAPER binary.

    A ``.wav`` file is handed to REAPER directly. Any other file type is
    first converted to WAV with pydub inside a temporary directory, which is
    removed again afterwards (also when conversion or REAPER fails).

    Args:
        sound_path: Path of the audio file.

    Returns:
        List of ``[time, f0]`` float pairs, one per REAPER output row whose
        middle field is ``'1'``. May be empty.

    Raises:
        RuntimeError: If REAPER's output lacks the ``EST_Header_End`` marker.
    """
    import os
    import tempfile

    stem, ext = os.path.splitext(os.path.basename(sound_path))
    if ext.lower() == ".wav":
        return _parse_reaper_f0(_run_reaper(sound_path))

    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_path = os.path.join(tmp_dir, f"{stem}_tmp.wav")
        # pydub takes the format name without the leading dot; None lets it
        # detect the format when the path has no extension.
        audio = AudioSegment.from_file(sound_path, ext.lstrip(".") or None)
        audio.export(wav_path, format="wav")
        reaper_output = _run_reaper(wav_path)
    return _parse_reaper_f0(reaper_output)
# The transcript may come from a corpus alongside the wav file,
# be typed in by the user,
# or be the output of a previous speech recognition process.
def align_and_graph(sound_path, transcript, aligner_function):
    """Plot pitch, RMS energy and aligned word/segment labels for a recording.

    Args:
        sound_path: Path of the audio file.
        transcript: Text of what is said in the recording; it is normalised
            with ``normalise_transcript`` before alignment.
        aligner_function: Callable invoked as
            ``aligner_function(speech, normalised_transcript)``, where
            ``speech`` is the result of ``readaud``. It must return
            ``(w_align, seg_align)``: a non-empty sequence of
            ``(word, start, end)`` triples and an iterable of
            ``(segment, start, end)`` triples. Start/end are used as x-axis
            positions (presumably seconds -- confirm against the aligner).

    Returns:
        The matplotlib figure. Its left axis shows the pitch track as a
        scatter plot, its right axis the RMS energy curve.
    """
    # fetch data
    speech = readaud(sound_path)
    w_align, seg_align = aligner_function(speech, normalise_transcript(transcript))

    # set up the graph shape: x spans first word start to last word end
    rec_start = w_align[0][1]
    rec_end = w_align[-1][2]

    f0_data = get_pitch_tracks(sound_path)
    if f0_data:
        # Leave headroom above the highest pitch value.
        f_max = max(f0 for _, f0 in f0_data) + 50
    else:
        # No pitch rows at all: fall back to a fixed axis height.
        f_max = 400

    fig, axes1 = plt.subplots(figsize=(15, 3))
    axes1.set_xlim([rec_start, rec_end])
    axes1.set_ylim([0.0, f_max])

    # label each word, centred on its interval, just above the plot area
    for word, start, end in w_align:
        axes1.text((start + end) / 2, f_max + 15, word, fontsize=15, ha="center")

    # label each phone / char just below the plot area
    for seg, start, end in seg_align:
        axes1.text(
            (start + end) / 2 - (len(seg) * .01),
            -1 * f_max / 10,
            seg,
            fontsize=11,
            color='teal',
        )

    # pitch track
    f0_times = [t for t, _ in f0_data]
    f0_values = [f0 for _, f0 in f0_data]
    axes1.scatter(f0_times, f0_values, color="blue")

    # RMS energy per frame
    wave, sr = librosa.load(sound_path)
    fr_l = 2048  # librosa default
    h_l = 512  # default
    rmse = librosa.feature.rms(y=wave, frame_length=fr_l, hop_length=h_l)[0]

    # show rms energy on a second y-axis sharing the same x-axis
    axes2 = axes1.twinx()
    axes2.set_ylim([0.0, 0.5])
    # Frame index -> position of the frame on the x-axis.
    rms_xval = np.arange(len(rmse)) * h_l / sr
    # NOTE(review): the x-values and the axis label were prepared, but no
    # draw call for the energy curve was present; colour matches the label.
    axes2.plot(rms_xval, rmse, color="coral")

    # label the graph
    axes1.set_ylabel("Pitch (F0, Hz)", fontsize=14, color="blue")
    axes2.set_ylabel("RMS energy", fontsize=14, color="coral")

    return fig
# uppboðssøla bussleiðini viðmerkingar upprunaligur