Update graph.py
Browse files
graph.py
CHANGED
@@ -4,17 +4,17 @@ from scipy import signal
|
|
4 |
import librosa
|
5 |
import subprocess
|
6 |
import matplotlib.pyplot as plt
|
|
|
7 |
|
8 |
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
wav = wav.mean(1)
|
14 |
if sr != 16000:
|
15 |
-
|
16 |
-
|
17 |
-
return
|
18 |
|
19 |
|
20 |
def normalise_transcript(xcp):
|
@@ -25,7 +25,20 @@ def normalise_transcript(xcp):
|
|
25 |
|
26 |
|
27 |
|
28 |
-
def get_pitch_tracks(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
print('FILE PATH:', wav_path)
|
30 |
f0_data = subprocess.run(["REAPER/build/reaper", "-i", wav_path, '-f', '/dev/stdout', '-a'],capture_output=True).stdout
|
31 |
print('PLAIN:',f0_data)
|
@@ -35,19 +48,25 @@ def get_pitch_tracks(wav_path):
|
|
35 |
#print(f0_data)
|
36 |
f0_data = [l.split(' ') for l in f0_data[:-1]] # the last line is other info
|
37 |
f0_data = [ [float(t), float(f)] for t,v,f in f0_data if v=='1']
|
|
|
|
|
|
|
|
|
38 |
return f0_data
|
39 |
|
40 |
|
|
|
|
|
41 |
# transcript could be from a corpus with the wav file,
|
42 |
# input by the user,
|
43 |
# or from a previous speech recognition process
|
44 |
-
def align_and_graph(
|
45 |
|
46 |
plt.close('all')
|
47 |
-
|
48 |
|
49 |
# fetch data
|
50 |
-
speech =
|
51 |
w_align, seg_align = aligner_function(speech,normalise_transcript(transcript))
|
52 |
|
53 |
|
@@ -55,7 +74,7 @@ def align_and_graph(wav_path, transcript, aligner_function):
|
|
55 |
rec_start = w_align[0][1]
|
56 |
rec_end = w_align[-1][2]
|
57 |
|
58 |
-
f0_data = get_pitch_tracks(
|
59 |
if f0_data:
|
60 |
f_max = max([f0 for t,f0 in f0_data]) + 50
|
61 |
else:
|
@@ -85,7 +104,7 @@ def align_and_graph(wav_path, transcript, aligner_function):
|
|
85 |
|
86 |
|
87 |
|
88 |
-
w, sr = librosa.load(
|
89 |
fr_l = 2048 # librosa default
|
90 |
h_l = 512 # default
|
91 |
rmse = librosa.feature.rms(y=w, frame_length = fr_l, hop_length = h_l)
|
|
|
4 |
import librosa
|
5 |
import subprocess
|
6 |
import matplotlib.pyplot as plt
|
7 |
+
from pydub import AudioSegment
|
8 |
|
9 |
|
10 |
+
def readaud(sound_path):
    """Load an audio file as a mono signal resampled to 16 kHz.

    Parameters
    ----------
    sound_path : str
        Path to an audio file readable by soundfile (``sf.read``).

    Returns
    -------
    numpy.ndarray
        1-D sample array at 16 kHz. Read as float32; NOTE(review):
        ``scipy.signal.resample`` may upcast to float64 — confirm the
        downstream aligner accepts that.
    """
    aud, sr = sf.read(sound_path, dtype=np.float32)
    # Down-mix multi-channel (samples, channels) input to mono by
    # averaging across channels.
    if aud.ndim == 2:
        aud = aud.mean(1)
    # Resample to the 16 kHz rate the aligner expects; length is scaled
    # proportionally (truncated to an integer sample count).
    if sr != 16000:
        target_len = int(len(aud) / sr * 16000)
        aud = signal.resample(aud, target_len)
    return aud
|
18 |
|
19 |
|
20 |
def normalise_transcript(xcp):
|
|
|
25 |
|
26 |
|
27 |
|
28 |
+
def get_pitch_tracks(sound_path):
|
29 |
+
|
30 |
+
orig_ftype = sound_path.split('.')[-1]
|
31 |
+
|
32 |
+
if orig_ftype == '.wav':
|
33 |
+
wav_path = sound_path
|
34 |
+
|
35 |
+
else:
|
36 |
+
aud_data = AudioSegment.from_file(sound_path, orig_ftype)
|
37 |
+
curdir = subprocess.run(["pwd"], capture_output=True, text=True).stdout
|
38 |
+
tmp_path = f'{curdir}/tmp.wav'
|
39 |
+
aud_data.export(tmp_path, format="wav")
|
40 |
+
wav_path = tmp_path
|
41 |
+
|
42 |
print('FILE PATH:', wav_path)
|
43 |
f0_data = subprocess.run(["REAPER/build/reaper", "-i", wav_path, '-f', '/dev/stdout', '-a'],capture_output=True).stdout
|
44 |
print('PLAIN:',f0_data)
|
|
|
48 |
#print(f0_data)
|
49 |
f0_data = [l.split(' ') for l in f0_data[:-1]] # the last line is other info
|
50 |
f0_data = [ [float(t), float(f)] for t,v,f in f0_data if v=='1']
|
51 |
+
|
52 |
+
if orig_ftype != '.wav':
|
53 |
+
subprocess.run(["rm", tmp_path])
|
54 |
+
|
55 |
return f0_data
|
56 |
|
57 |
|
58 |
+
|
59 |
+
|
60 |
# transcript could be from a corpus with the wav file,
|
61 |
# input by the user,
|
62 |
# or from a previous speech recognition process
|
63 |
+
def align_and_graph(sound_path, transcript, aligner_function):
|
64 |
|
65 |
plt.close('all')
|
66 |
+
|
67 |
|
68 |
# fetch data
|
69 |
+
speech = readaud(sound_path)
|
70 |
w_align, seg_align = aligner_function(speech,normalise_transcript(transcript))
|
71 |
|
72 |
|
|
|
74 |
rec_start = w_align[0][1]
|
75 |
rec_end = w_align[-1][2]
|
76 |
|
77 |
+
f0_data = get_pitch_tracks(sound_path)
|
78 |
if f0_data:
|
79 |
f_max = max([f0 for t,f0 in f0_data]) + 50
|
80 |
else:
|
|
|
104 |
|
105 |
|
106 |
|
107 |
+
w, sr = librosa.load(sound_path)
|
108 |
fr_l = 2048 # librosa default
|
109 |
h_l = 512 # default
|
110 |
rmse = librosa.feature.rms(y=w, frame_length = fr_l, hop_length = h_l)
|